diff --git a/.github/workflows/archiver.yml b/.github/workflows/archiver.yml index 3aaf9b35ed..5ac17d45a2 100644 --- a/.github/workflows/archiver.yml +++ b/.github/workflows/archiver.yml @@ -1,7 +1,7 @@ # Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: A. Valassi (Sep 2024) for the MG5aMC CUDACPP plugin. -# Further modified by: D. Massaro, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: D. Massaro, A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. #---------------------------------------------------------------------------------------------------------------------------------- diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 7dd6a2f963..72ffe64b17 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -1,3 +1,8 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, S. Roiser, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. + name: C/C++ CI on: diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... 
) kernel<<<blocks, threads>>>( __VA_ARGS__ )
-#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+
+#define gpuStream_t cudaStream_t
+#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) )
+#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) )
+
+#define gpuBlasStatus_t cublasStatus_t
+#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS
+#ifndef MGONGPU_HAS_NO_BLAS
+#define gpuBlasHandle_t cublasHandle_t
+#else
+#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds
+#endif
+#define gpuBlasCreate cublasCreate
+#define gpuBlasDestroy cublasDestroy
+#define gpuBlasSetStream cublasSetStream
+
+#define gpuBlasSaxpy cublasSaxpy
+#define gpuBlasSdot cublasSdot
+#define gpuBlasSgemv cublasSgemv
+#define gpuBlasSgemm cublasSgemm
+#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched
+#define gpuBlasDaxpy cublasDaxpy
+#define gpuBlasDdot cublasDdot
+#define gpuBlasDgemv cublasDgemv
+#define gpuBlasDgemm cublasDgemm
+#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched
+#define GPUBLAS_OP_N CUBLAS_OP_N
+#define GPUBLAS_OP_T CUBLAS_OP_T
 //--------------------------------------------------------------------------
 #elif defined __HIPCC__
+#ifndef MGONGPU_HAS_NO_BLAS
+#include "hipblas/hipblas.h"
+#endif
+
 #define gpuError_t hipError_t
 #define gpuPeekAtLastError hipPeekAtLastError
 #define gpuGetErrorString hipGetErrorString
@@ -48,22 +91,69 @@
 #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
 #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) )
 #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
 #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
 #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
 #define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
 #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) )
+
 #define gpuSetDevice hipSetDevice
 #define gpuDeviceSynchronize hipDeviceSynchronize
 #define gpuDeviceReset hipDeviceReset
 #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
-#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+
+#define gpuStream_t hipStream_t
+#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) )
+#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) )
+
+#define gpuBlasStatus_t hipblasStatus_t
+#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#ifndef MGONGPU_HAS_NO_BLAS
+#define gpuBlasHandle_t hipblasHandle_t
+#else
+#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds
+#endif
+#define gpuBlasCreate hipblasCreate
+#define gpuBlasDestroy hipblasDestroy
+#define gpuBlasSetStream hipblasSetStream
+
+#define gpuBlasSaxpy hipblasSaxpy
+#define gpuBlasSdot hipblasSdot
+#define gpuBlasSgemv hipblasSgemv
+#define gpuBlasSgemm hipblasSgemm
+#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched
+#define gpuBlasDaxpy hipblasDaxpy
+#define gpuBlasDdot hipblasDdot
+#define gpuBlasDgemv hipblasDgemv
+#define gpuBlasDgemm hipblasDgemm
+#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched
+#define GPUBLAS_OP_N HIPBLAS_OP_N
+#define GPUBLAS_OP_T HIPBLAS_OP_T
+
+#endif
 //--------------------------------------------------------------------------
+#ifdef MGONGPU_FPTYPE2_FLOAT
+#define gpuBlasTaxpy gpuBlasSaxpy
+#define gpuBlasTdot gpuBlasSdot
+#define gpuBlasTgemv gpuBlasSgemv
+#define gpuBlasTgemm gpuBlasSgemm
+#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched
+#else
+#define gpuBlasTaxpy gpuBlasDaxpy
+#define gpuBlasTdot gpuBlasDdot
+#define gpuBlasTgemv gpuBlasDgemv
+#define gpuBlasTgemm gpuBlasDgemm
+#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched
 #endif
 #endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h
index 6a4b946e74..086aa6a616 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h
@@ -1,4 +1,4 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
 // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
@@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort =
 //--------------------------------------------------------------------------
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifndef MGONGPU_HAS_NO_BLAS
+#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); }
+inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true )
+{
+  if ( code != GPUBLAS_STATUS_SUCCESS )
+  {
+    printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc index 703ea3781c..5ede45b123 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc @@ -1,4 +1,4 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. // Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. @@ -166,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -193,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -208,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -220,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -314,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -341,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -363,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. 
Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -385,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? 
&m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -403,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h index 8da04d7945..16f8874888 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h @@ -1,4 +1,4 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. // Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
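For readers not familiar with the runtime switch used above: the MatrixElementKernelDevice constructor enables the BLAS color sum only when the CUDACPP_RUNTIME_BLASCOLORSUM environment variable is set and non-empty, and only in builds where MGONGPU_HAS_NO_BLAS is not defined. The standalone sketch below only mirrors that decision logic for illustration; the helper name is hypothetical and not part of the plugin.

```cpp
// Minimal sketch (not plugin code) of the runtime BLAS-color-sum switch described above.
#include <cstdlib>
#include <stdexcept>
#include <string>

inline bool useBlasColorSum() // hypothetical helper, mirroring the MatrixElementKernelDevice ctor logic
{
  const char* blasEnv = std::getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" );
  const bool requested = ( blasEnv && std::string( blasEnv ) != "" );
#ifdef MGONGPU_HAS_NO_BLAS
  // hasNoBlas build: requesting BLAS at runtime is an error (the plugin throws in this case)
  if( requested ) throw std::runtime_error( "CUDACPP_RUNTIME_BLASCOLORSUM is set, but BLAS was disabled at build time" );
  return false;
#else
  // hasBlas build: BLAS color sums are off by default and only enabled on request
  return requested;
#endif
}
```

When BLAS is disabled (at build time or at runtime), computeMatrixElements passes null ghelAllBlasTmp and pBlasHandle pointers to sigmaKin, and the per-helicity color_sum_kernel path is used instead.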
@@ -8,9 +8,12 @@ #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h index deddc425f5..936ef7a7ff 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_%(model_name)s_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << 
std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) 
per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.cc new file mode 100644 index 0000000000..d2b24bba27 --- /dev/null +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.cc @@ -0,0 +1,418 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** +%(color_matrix_lines)s + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 
value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for 
one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
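A note on the data layout before the header itself: DeviceAccessJamp (below) and the cuBLAS/hipBLAS calls in color_sum_blas (above) share the same "ievt last" striding for the jamp super-buffer, [2][ncolor][nGoodHel][nevt]. The small standalone helper below only illustrates that indexing convention; the function name and the numbers in the example are not taken from the plugin.

```cpp
// Illustrative index helper (not plugin code): flat offset into the jamp super-buffer
// laid out as [ix2][icol][ihel][ievt] with ix2 = 0 (real) or 1 (imaginary), "ievt last".
#include <cstddef>

constexpr std::size_t jampIndex( std::size_t ix2, std::size_t icol, std::size_t ihel, std::size_t ievt,
                                 std::size_t ncolor, std::size_t nhel, std::size_t nevt )
{
  // Equivalent to ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt
  return ( ( ix2 * ncolor + icol ) * nhel + ihel ) * nevt + ievt;
}

// Example with (hypothetical) ncolor=2, nhel=3, nevt=8: the imaginary part of color 1,
// helicity 2, event 5 sits at offset ((1*2+1)*3+2)*8+5 = 93.
static_assert( jampIndex( 1, 1, 2, 5, 2, 3, 8 ) == 93, "unexpected jamp offset" );
```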
+ +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef 
MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index 32d12a5bba..22acd3abe9 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -479,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -599,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -782,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -801,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -834,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
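For illustration, a minimal standalone sketch (not part of the patch; the printed messages are placeholders for the real BLAS and kernel code paths) of how the -DMGONGPU_HAS_NO_BLAS flag added above via BLASCXXFLAGS is consumed at compile time.

#include <cstdio>

int main()
{
#ifndef MGONGPU_HAS_NO_BLAS
  // HASBLAS=hasBlas builds: the color sum may be offloaded to cuBLAS/hipBLAS
  std::printf( "BLAS-enabled build\n" );
#else
  // HASBLAS=hasNoBlas builds: only the plain CUDA/HIP kernel path is compiled
  std::printf( "noBLAS build\n" );
#endif
  return 0;
}

Compiling with and without -DMGONGPU_HAS_NO_BLAS (as cudacpp.mk does for HASBLAS=hasNoBlas) selects the corresponding branch.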
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -878,6 +931,7 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 @@ -979,6 +1033,7 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_test.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_test.mk index b4df265133..48b2037dc2 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_test.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
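The cuBLAS/hipBLAS libraries linked in above via $(BLASLIBFLAGS) are used for the color sum, which per event and per helicity is the real quadratic form Re(J)^T M Re(J) + Im(J)^T M Im(J) over the ncolor jamps (M being the real, symmetric color matrix, see #475). Below is a standalone scalar sketch of that sum with made-up numbers, mirroring the scalar code removed from process_matrix.inc further down in this patch; the 2x2 color matrix and jamp values are illustrative only.

#include <cstdio>

int main()
{
  const int ncolor = 2;
  // Made-up example color matrix cf[i][j] and denominators denom[i] (process-dependent in reality)
  const double cf[2][2] = { { 16., -2. }, { -2., 16. } };
  const double denom[2] = { 6., 6. };
  // Made-up example jamps (QCD partial amplitudes) for one event and one helicity
  const double jampR[2] = { 0.3, -0.1 };
  const double jampI[2] = { 0.2, 0.4 };
  double me2 = 0.; // |M|^2 contribution of this helicity
  for( int icol = 0; icol < ncolor; icol++ )
  {
    double ztempR = 0., ztempI = 0.;
    for( int jcol = 0; jcol < ncolor; jcol++ )
    {
      ztempR += cf[icol][jcol] * jampR[jcol]; // (M * Re(J))[icol]
      ztempI += cf[icol][jcol] * jampI[jcol]; // (M * Im(J))[icol]
    }
    me2 += ( ztempR * jampR[icol] + ztempI * jampI[icol] ) / denom[icol];
  }
  std::printf( "|M|^2 contribution for this helicity = %f\n", me2 );
  return 0;
}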
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h index 68bbf1b934..c32d0a2740 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for computing the color sum of the QCD partial amplitudes (jamps) +// For both CUDA and HIP, by default, use cuBLAS/hipBLAS if available, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc index 444c848e10..4c35c3eec6 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc @@ -14,6 +14,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" %(hel_amps_h)s #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -23,6 +24,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_class.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_class.inc index 4e5e942a41..b0f0b44e26 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_class.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_class.inc @@ -50,6 +50,7 @@ static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = %(nbhel)d; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = %(ndiagrams)d; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = %(ncolor)s; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. 
a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc index 76b6e773bd..0665bfb93b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -1,7 +1,7 @@ ! Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. ! Created by: J. Alwall (Jul 2010) for the MG5aMC CPP backend. !========================================================================== -! Copyright (C) 2020-2024 CERN and UCLouvain. +! Copyright (C) 2020-2025 CERN and UCLouvain. ! Licensed under the GNU Lesser General Public License (version 3 or later). ! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. ! Further modified by: J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. @@ -16,9 +16,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -26,10 +27,7 @@ namespace mg5amcCpu using Parameters_%(model_name)s_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_%(model_name)s_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = %(ncolor)s; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -88,12 +86,58 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- @@ -117,8 +161,10 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif + + // Enable SIGFPE traps for Floating Point Exceptions #ifdef MGONGPUCPP_DEBUG - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + fpeEnable(); #endif } @@ -148,6 +194,10 @@ namespace mg5amcCpu //m_pars->printDependentCouplings(); // now computed event-by-event (running alphas #373) } %(initProc_lines)s +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: 
initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory %(cipdassign)s @@ -183,6 +233,10 @@ namespace mg5amcCpu //Parameters_%(model_name)s::printDependentCouplings(); // now computed event-by-event (running alphas #373) } %(hardcoded_initProc_lines)s +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -303,8 +357,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -312,25 +366,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%%4d rndhel=%%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%%4d ihel=%%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // 
no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which is greater than nchannels=%%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d (invalid SDE iconfig=%%d\n > nconfig=%%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%%4d rndcol=%%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%%d icol=%%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], 
running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -475,13 +707,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { %(den_factors)s }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) %% mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) %% mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc index 895b7ec1d6..7de8886b1d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc @@ -17,6 +17,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_%(model_name)s.h" #include @@ -46,7 +47,7 @@ namespace mg5amcCpu 
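The new GPU sigmaKin and color_sum_gpu signatures above (with their ghel* super-buffers, per-helicity streams and BLAS handle) are designed around a stream-per-helicity pattern. Below is a self-contained CUDA sketch of that pattern only; the kernels, buffer contents and sizes are made up for illustration and stand in for calculate_jamps, the color sum and add_and_select_hel respectively.

#include <cstdio>
#include <cuda_runtime.h>

// Stand-in for calculate_jamps (+ color sum): fill this helicity's slice of the super-buffer
__global__ void fillHelicity( float* ghelBuf, int ighel, int nevt )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  if( ievt < nevt ) ghelBuf[ighel * nevt + ievt] = 1.f + ighel;
}

// Stand-in for add_and_select_hel: sum the per-helicity MEs for each event
__global__ void sumHelicities( const float* ghelBuf, float* allMEs, int nGoodHel, int nevt )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  if( ievt >= nevt ) return;
  float me = 0.f;
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) me += ghelBuf[ighel * nevt + ievt];
  allMEs[ievt] = me;
}

int main()
{
  const int nGoodHel = 4, gpublocks = 2, gputhreads = 32, nevt = gpublocks * gputhreads;
  float *ghelBuf = nullptr, *allMEs = nullptr;
  cudaMalloc( &ghelBuf, nGoodHel * nevt * sizeof( float ) );
  cudaMalloc( &allMEs, nevt * sizeof( float ) );
  cudaStream_t streams[nGoodHel];
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) cudaStreamCreate( &streams[ighel] );
  // (1) one launch per good helicity, each on its own stream
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
    fillHelicity<<<gpublocks, gputhreads, 0, streams[ighel]>>>( ghelBuf, ighel, nevt );
  // (2) wait for all helicity streams before any cross-helicity work
  cudaDeviceSynchronize();
  // (3) sum over helicities (the real code then also selects one helicity and one color per event)
  sumHelicities<<<gpublocks, gputhreads>>>( ghelBuf, allMEs, nGoodHel, nevt );
  cudaDeviceSynchronize();
  float me0 = 0.f;
  cudaMemcpy( &me0, allMEs, sizeof( float ), cudaMemcpyDeviceToHost );
  std::printf( "ME[0] = %f (expected 1+2+3+4 = 10)\n", me0 );
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) cudaStreamDestroy( streams[ighel] );
  cudaFree( ghelBuf );
  cudaFree( allMEs );
  return 0;
}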
//-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -54,9 +55,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -76,34 +79,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: 
momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc index 2700d7e7da..aac7506855 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc @@ -8,145 +8,43 @@ !========================================================================== // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called %(process_class_name)s::matrix_%(proc_name)s(%(matrix_args)s)?) -%(color_matrix_lines)s - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! 
skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; -#else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%%6d ihel=%%2d me_running=%%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%%6d ihel=%%2d me_running=%%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%%6d ihel=%%2d me_running=%%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc index d49047a623..4372edde52 100644 --- 
a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc @@ -6,18 +6,23 @@ ! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. ! Further modified by: O. Mattelaer, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. !========================================================================== - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -42,93 +47,30 @@ #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%%4d rndhel=%%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%%4d ihel=%%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which is greater than nchannels=%%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d (invalid SDE iconfig=%%d\n > nconfig=%%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%%4d rndcol=%%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%%d icol=%%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -170,7 +112,7 @@ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -193,7 +135,7 @@ // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -202,21 +144,23 @@ } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -230,8 +174,10 @@ for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%%4d ighel=%%d MEs_ighel=%%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%%4d ighel=%%d MEs_ighel=%%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -247,11 +193,12 @@ //printf( "sigmaKin: ievt=%%4d rndhel=%%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%%4d ighel=%%d MEs_ighel=%%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%%4d ihel=%%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%%4d ihel=%%4d\n", ievt2, ihelF ); break; } } @@ -353,14 +300,15 @@ #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py index 7d7996a674..3f8a85afa6 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py 
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py @@ -1308,33 +1308,43 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): self.couplings2order = self.helas_call_writer.couplings2order self.params2order = self.helas_call_writer.params2order ret_lines.append(""" - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else 
+ cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -1346,7 +1356,6 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -1355,14 +1364,17 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\\n", ihel );""") + //if( debug ) printf( \"calculate_jamps: ievt00=%d ihel=%2d\\n\", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( \"calculate_jamps: ievt=%6d ihel=%2d\\n\", ievt, ihel ); +#endif /* clang-format on */""") nwavefuncs = self.matrix_elements[0].get_number_of_wavefunctions() ret_lines.append(""" // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here @@ -1389,14 +1401,10 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even 
pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif""") @@ -1413,6 +1421,7 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): file = self.get_matrix_single_process( i, me, color_amplitudes[i], class_name ) file = '\n'.join( file.split('\n')[8:] ) # skip first 8 lines in process_matrix.inc (copyright) file_extend.append( file ) + assert i == 0, "more than one ME in get_all_sigmaKin_lines" # AV sanity check (added for color_sum.cc but valid independently) ret_lines.extend( file_extend ) return '\n'.join(ret_lines) @@ -1442,7 +1451,7 @@ def generate_process_files(self): self.edit_check_sa() self.edit_mgonGPU() self.edit_processidfile() # AV new file (NB this is Sigma-specific, should not be a symlink to Subprocesses) - + self.edit_colorsum() # AV new file (NB this is Sigma-specific, should not be a symlink to Subprocesses) self.edit_testxxx() # AV new file (NB this is generic in Subprocesses and then linked in Sigma-specific) self.edit_memorybuffers() # AV new file (NB this is generic in Subprocesses and then linked in Sigma-specific) self.edit_memoryaccesscouplings() # AV new file (NB this is generic in Subprocesses and then linked in Sigma-specific) @@ -1523,6 +1532,17 @@ def edit_processidfile(self): ff.write(template % replace_dict) ff.close() + # AV - new method + def edit_colorsum(self): + """Generate color_sum.cc""" + ###misc.sprint('Entering PLUGIN_OneProcessExporter.edit_colorsum') + template = open(pjoin(self.template_path,'gpu','color_sum.cc'),'r').read() + replace_dict = {} + # Extract color matrix again (this was also in get_matrix_single_process called within get_all_sigmaKin_lines) + replace_dict['color_matrix_lines'] = self.get_color_matrix_lines(self.matrix_elements[0]) + ff = open(pjoin(self.path, 'color_sum.cc'),'w') + ff.write(template % replace_dict) + ff.close() def generate_subprocess_directory_end(self, **opt): """ opt contain all local variable of the fortran original function""" @@ -1693,11 +1713,11 @@ def get_color_matrix_lines(self, matrix_element): """Return the color matrix definition lines for this matrix element. 
Split rows in chunks of size n.""" import madgraph.core.color_algebra as color if not matrix_element.get('color_matrix'): - return '\n'.join([' static constexpr fptype2 denom[1] = {1.};', 'static const fptype2 cf[1][1] = {1.};']) + return '\n'.join([' static constexpr fptype2 colorDenom[1] = {1.};', 'static const fptype2 cf[1][1] = {1.};']) else: color_denominators = matrix_element.get('color_matrix').\ get_line_denominators() - denom_string = ' static constexpr fptype2 denom[ncolor] = { %s }; // 1-D array[%i]' \ + denom_string = ' static constexpr fptype2 colorDenom[ncolor] = { %s }; // 1-D array[%i]' \ % ( ', '.join(['%i' % denom for denom in color_denominators]), len(color_denominators) ) matrix_strings = [] my_cs = color.ColorString() @@ -1705,12 +1725,12 @@ def get_color_matrix_lines(self, matrix_element): # Then write the numerators for the matrix elements num_list = matrix_element.get('color_matrix').get_line_numerators(index, denominator) matrix_strings.append('{ %s }' % ', '.join(['%d' % i for i in num_list])) - matrix_string = ' static constexpr fptype2 cf[ncolor][ncolor] = ' - if len( matrix_strings ) > 1 : matrix_string += '{\n ' + ',\n '.join(matrix_strings) + ' };' + matrix_string = ' static constexpr fptype2 colorMatrix[ncolor][ncolor] = ' + if len( matrix_strings ) > 1 : matrix_string += '{\n ' + ',\n '.join(matrix_strings) + ' };' else: matrix_string += '{ ' + matrix_strings[0] + ' };' matrix_string += ' // 2-D array[%i][%i]' % ( len(color_denominators), len(color_denominators) ) - denom_comment = '\n // The color denominators (initialize all array elements, with ncolor=%i)\n // [NB do keep \'static\' for these constexpr arrays, see issue #283]\n' % len(color_denominators) - matrix_comment = '\n // The color matrix (initialize all array elements, with ncolor=%i)\n // [NB do keep \'static\' for these constexpr arrays, see issue #283]\n' % len(color_denominators) + denom_comment = '\n // The color denominators (initialize all array elements, with ncolor=%i)\n // [NB do keep \'static\' for these constexpr arrays, see issue #283]\n' % len(color_denominators) + matrix_comment = '\n // The color matrix (initialize all array elements, with ncolor=%i)\n // [NB do keep \'static\' for these constexpr arrays, see issue #283]\n' % len(color_denominators) denom_string = denom_comment + denom_string matrix_string = matrix_comment + matrix_string return '\n'.join([denom_string, matrix_string]) @@ -1905,7 +1925,6 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -1919,7 +1938,6 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -1930,6 +1948,10 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi for( int i = 0; i < ncolor; i++ 
) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py index 6562d189da..e54290d5a7 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py @@ -103,6 +103,7 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): s+'CMake/src/CMakeLists.txt' ], 'SubProcesses': [s+'gpu/nvtx.h', s+'gpu/timer.h', s+'gpu/timermap.h', s+'gpu/ompnumthreads.h', s+'gpu/GpuRuntime.h', s+'gpu/GpuAbstraction.h', + s+'gpu/color_sum.h', s+'gpu/MemoryAccessHelpers.h', s+'gpu/MemoryAccessVectors.h', s+'gpu/MemoryAccessMatrixElements.h', s+'gpu/MemoryAccessMomenta.h', s+'gpu/MemoryAccessRandomNumbers.h', s+'gpu/MemoryAccessWeights.h', @@ -127,6 +128,7 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): to_link_in_P = ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', + 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', diff --git a/epochX/cudacpp/CODEGEN/allGenerateAndCompare.sh b/epochX/cudacpp/CODEGEN/allGenerateAndCompare.sh index 097935efc8..00004df108 100755 --- a/epochX/cudacpp/CODEGEN/allGenerateAndCompare.sh +++ b/epochX/cudacpp/CODEGEN/allGenerateAndCompare.sh @@ -8,38 +8,69 @@ set -e # fail on error cd $(dirname $0)/.. 
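Note on the model_handling.py hunks above: the former calculate_wavefunctions is split into calculate_jamps, which only evaluates the QCD partial amplitudes (jamps) for one helicity, while the colour sum is moved to the newly generated color_sum.cc/color_sum.h (added to the SubProcesses file lists above); the colour constants are also renamed denom to colorDenom and cf to colorMatrix. As a purely illustrative, standalone sketch (not the generated code), the contraction that the separate colour-sum step performs has the standard MadGraph form |M|^2 = sum_i ( sum_j colorMatrix[i][j] * jamp[j] ) * conj( jamp[i] ) / colorDenom[i]:

    // Illustrative sketch only; the names colorMatrix/colorDenom follow the renamed arrays above.
    #include <complex>
    template<int ncolor>
    double colorSum( const std::complex<double> jamp[ncolor],  // QCD partial amplitudes for one event and helicity
                     const double colorMatrix[ncolor][ncolor], // colour matrix numerators
                     const double colorDenom[ncolor] )         // colour matrix denominators
    {
      double me2 = 0;
      for( int i = 0; i < ncolor; i++ )
      {
        std::complex<double> ztemp = 0;
        for( int j = 0; j < ncolor; j++ ) ztemp += colorMatrix[i][j] * jamp[j];
        me2 += ( ztemp * std::conj( jamp[i] ) ).real() / colorDenom[i];
      }
      return me2;
    }

In the new scheme this contraction runs on the jamp buffers produced by calculate_jamps, as a separate function/kernel rather than inside the wavefunction evaluation; the channelId handling is split accordingly, with the CUDA branch reading a per-event scalar via gpu_channelId( allChannelIds ) as shown above.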
-./CODEGEN/generateAndCompare.sh -q ee_mumu -./CODEGEN/generateAndCompare.sh -q ee_mumu --mad +bsm= +while [ "$1" != "" ]; do + if [ "$1" == "-bsmonly" ] && [ "$bsm" != "-nobsm" ]; then + bsm=$1 + shift + elif [ "$1" == "-nobsm" ] && [ "$bsm" != "-bsmonly" ]; then + bsm=$1 + shift + else + echo "Usage: $0 [-bsmonly|-nobsm]" + fi +done -./CODEGEN/generateAndCompare.sh -q gg_tt -./CODEGEN/generateAndCompare.sh -q gg_tt --mad +# SM processes (both mad and sa) -./CODEGEN/generateAndCompare.sh -q gg_ttg -./CODEGEN/generateAndCompare.sh -q gg_ttg --mad +if [ "${bsm}" != "-bsmonly" ]; then -./CODEGEN/generateAndCompare.sh -q gg_ttgg -./CODEGEN/generateAndCompare.sh -q gg_ttgg --mad + ./CODEGEN/generateAndCompare.sh -q ee_mumu + ./CODEGEN/generateAndCompare.sh -q ee_mumu --mad -./CODEGEN/generateAndCompare.sh -q gg_ttggg -./CODEGEN/generateAndCompare.sh -q gg_ttggg --mad + ./CODEGEN/generateAndCompare.sh -q gg_tt + ./CODEGEN/generateAndCompare.sh -q gg_tt --mad -./CODEGEN/generateAndCompare.sh -q gq_ttq -./CODEGEN/generateAndCompare.sh -q gq_ttq --mad + ./CODEGEN/generateAndCompare.sh -q gg_ttg + ./CODEGEN/generateAndCompare.sh -q gg_ttg --mad -./CODEGEN/generateAndCompare.sh -q heft_gg_bb -./CODEGEN/generateAndCompare.sh -q heft_gg_bb --mad + ./CODEGEN/generateAndCompare.sh -q gg_ttgg + ./CODEGEN/generateAndCompare.sh -q gg_ttgg --mad -./CODEGEN/generateAndCompare.sh -q susy_gg_tt -./CODEGEN/generateAndCompare.sh -q susy_gg_tt --mad + ./CODEGEN/generateAndCompare.sh -q gg_ttggg + ./CODEGEN/generateAndCompare.sh -q gg_ttggg --mad -./CODEGEN/generateAndCompare.sh -q susy_gg_t1t1 -./CODEGEN/generateAndCompare.sh -q susy_gg_t1t1 --mad + ./CODEGEN/generateAndCompare.sh -q gq_ttq + ./CODEGEN/generateAndCompare.sh -q gq_ttq --mad -./CODEGEN/generateAndCompare.sh -q smeft_gg_tttt -./CODEGEN/generateAndCompare.sh -q smeft_gg_tttt --mad +fi -./CODEGEN/generateAndCompare.sh -q nobm_pp_ttW --mad +# BSM processes -./CODEGEN/generateAndCompare.sh -q gg_tt01g --mad +if [ "${bsm}" != "-nobsm" ]; then -./CODEGEN/generateAndCompare.sh -q pp_tt012j --mad + ./CODEGEN/generateAndCompare.sh -q heft_gg_bb + ./CODEGEN/generateAndCompare.sh -q heft_gg_bb --mad + + ./CODEGEN/generateAndCompare.sh -q susy_gg_tt + ./CODEGEN/generateAndCompare.sh -q susy_gg_tt --mad + + ./CODEGEN/generateAndCompare.sh -q susy_gg_t1t1 + ./CODEGEN/generateAndCompare.sh -q susy_gg_t1t1 --mad + + ./CODEGEN/generateAndCompare.sh -q smeft_gg_tttt + ./CODEGEN/generateAndCompare.sh -q smeft_gg_tttt --mad + + ./CODEGEN/generateAndCompare.sh -q nobm_pp_ttW --mad + +fi + +# SM processes (mad only) + +if [ "${bsm}" != "-bsmonly" ]; then + + ./CODEGEN/generateAndCompare.sh -q gg_tt01g --mad + + ./CODEGEN/generateAndCompare.sh -q pp_tt012j --mad + +fi diff --git a/epochX/cudacpp/CODEGEN/generateAndCompare.sh b/epochX/cudacpp/CODEGEN/generateAndCompare.sh index 6221b1cfee..fd46fd38f3 100755 --- a/epochX/cudacpp/CODEGEN/generateAndCompare.sh +++ b/epochX/cudacpp/CODEGEN/generateAndCompare.sh @@ -1,8 +1,8 @@ -#!/bin/bash -# Copyright (C) 2020-2024 CERN and UCLouvain. +#!/usr/bin/env bash +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -# Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
set -e # fail on error @@ -358,10 +358,12 @@ function codeGenAndDiff() fi done fi - # Remove card.jpg, diagrams.html and matrix*.jpg files (NB: these are only created if ghostscript is installed) + # Remove card.jpg/png, diagrams.html and matrix*.jpg/png files (NB: these are only created if ghostscript is installed) \rm -f ${outproc}/SubProcesses/P*/card.jpg + \rm -f ${outproc}/SubProcesses/P*/card.png \rm -f ${outproc}/SubProcesses/P*/diagrams.html \rm -f ${outproc}/SubProcesses/P*/matrix*jpg + \rm -f ${outproc}/SubProcesses/P*/matrix*png # Cleanup \rm -f ${outproc}/crossx.html \rm -f ${outproc}/index.html @@ -474,13 +476,6 @@ EOF if $SCRDIR/diffCode.sh ${BRIEF} -r -c ${proc}.${autosuffix}.BKP ${proc}.${autosuffix}; then echo "Old and new generated codes are identical"; else echo -e "\nWARNING! Old and new generated codes differ"; fi popd >& /dev/null fi - # Compare the existing manually developed code to the newly generated code for the specific process - if [ "${OUTBCK}" == "cudacpp" ] || [ "${OUTBCK}" == "gridpack" ]; then - pushd ${OUTDIR} >& /dev/null - echo -e "\n+++ Compare manually developed code to newly generated code for $proc\n" - if $SCRDIR/diffCode.sh ${BRIEF} -r -c ${proc} ${proc}.${autosuffix}; then echo "Manual and generated codes are identical"; else echo -e "\nWARNING! Manual and generated codes differ"; fi - popd >& /dev/null - fi # Print a summary of the available code if [ "$QUIET" != "1" ]; then echo diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index db84a9053c..f41ae1e58f 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +57,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006200551986694336  +DEBUG: model prefixing takes 0.008341312408447266  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,21 +149,21 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. 
INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.004 s +1 processes with 2 diagrams generated in 0.003 s Total: 1 processes with 2 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_ee_mumu INFO: remove old information in CODEGEN_mad_ee_mumu -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 @@ -176,22 +175,22 @@ FileWriter mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum -DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1552]  -Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 0.070 s +DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1577]  +Generated helas calls for 1 subprocesses (2 diagrams) in 0.005 s +Wrote files for 8 helas calls in 0.063 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.201 s +ALOHA: aloha creates 3 routines in 0.181 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.253 s +ALOHA: aloha creates 7 
routines in 0.189 s FFV1 FFV1 FFV2 @@ -200,38 +199,32 @@ ALOHA: aloha creates 7 routines in 0.253 s FFV4 FFV2_4 FFV2_4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses/P1_epem_mupmum; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #2 succeeded at 236 (offset 9 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done. +DEBUG: result.returncode =  0 [output.py at line 273]  +Output to directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README +/home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README Run "open index.html" to see more information about this process. 
quit -real 0m2.054s -user 0m1.767s -sys 0m0.275s -Code generation completed in 2 seconds +real 0m2.357s +user 0m1.875s +sys 0m0.386s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * @@ -244,7 +237,7 @@ Code generation completed in 2 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -252,10 +245,9 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -274,7 +266,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -282,10 +274,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/ee_mumu.mad/COPYRIGHT b/epochX/cudacpp/ee_mumu.mad/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/ee_mumu.mad/COPYRIGHT +++ b/epochX/cudacpp/ee_mumu.mad/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). 
All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt b/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt index 68b4c46295..311ceaa803 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat index bb623f867a..2343b09819 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.5 2025-10-17 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat b/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat index 74f70b567b..c1037c83d7 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat +++ b/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat @@ -112,6 +112,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat b/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat index 68ee164d00..4ba7540657 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat @@ -112,6 +112,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/ee_mumu.mad/MGMEVersion.txt b/epochX/cudacpp/ee_mumu.mad/MGMEVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/ee_mumu.mad/MGMEVersion.txt +++ b/epochX/cudacpp/ee_mumu.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/Source/.make_opts b/epochX/cudacpp/ee_mumu.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/.make_opts +++ b/epochX/cudacpp/ee_mumu.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/ee_mumu.mad/Source/alfas_functions.f b/epochX/cudacpp/ee_mumu.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/ee_mumu.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/ee_mumu.mad/Source/cuts.inc b/epochX/cudacpp/ee_mumu.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/cuts.inc +++ b/epochX/cudacpp/ee_mumu.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/ee_mumu.mad/Source/make_opts b/epochX/cudacpp/ee_mumu.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/make_opts +++ b/epochX/cudacpp/ee_mumu.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/ee_mumu.mad/Source/makefile b/epochX/cudacpp/ee_mumu.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/makefile +++ b/epochX/cudacpp/ee_mumu.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/ee_mumu.mad/Source/run_card.inc b/epochX/cudacpp/ee_mumu.mad/Source/run_card.inc index 80d5ae41aa..83061d9ae9 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/run_card.inc +++ b/epochX/cudacpp/ee_mumu.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. 
In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). + */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
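For orientation amid the reflowed comments, here is a hypothetical host-side driver for the cpu_sequence method documented here and declared just below. This is illustrative only: in production these calls go through the Fortran fbridge layer, and nevt/npar/np4 are example values (npar=4 matching e+ e- > mu+ mu-).

    #include "Bridge.h"
    #include <vector>
    // Hypothetical sketch, not part of the patch: one Bridge round trip for nevt
    // events in double precision on the C++ (non-GPU) build.
    void exampleBridgeCall()
    {
      const unsigned int nevt = 16384, npar = 4, np4 = 4; // example values only
      mg5amcCpu::Bridge<double> bridge( nevt, npar, np4 );
      std::vector<double> momenta( nevt * npar * np4 ), gs( nevt ), rndhel( nevt ), rndcol( nevt ), mes( nevt );
      std::vector<int> selhel( nevt ), selcol( nevt );
      // ... fill momenta (Fortran AOS layout), gs and the random numbers from the generator ...
      bridge.cpu_sequence( momenta.data(), gs.data(), rndhel.data(), rndcol.data(),
                           nullptr /* channelIds: nullptr disables multichannel mode */,
                           mes.data(), selhel.data(), selcol.data() );
      // mes now holds one matrix element per event; selhel/selcol hold the selected helicity/colour.
    }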
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
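The grid-sizing logic in the reflowed constructor above simply halves the thread count, starting from the default of 256, until gpublocks*gputhreads exactly equals nevt, and throws if it would drop below the minimum of 32. A standalone restatement for readability only (not part of the patch):

    #include <stdexcept>
    #include <string>
    #include <utility>
    // Illustrative restatement of the Bridge constructor's grid choice.
    inline std::pair<int, int> chooseGrid( int nevt, int gputhreads = 256, const int gputhreadsmin = 32 )
    {
      if( nevt < gputhreadsmin || nevt % gputhreadsmin != 0 )
        throw std::runtime_error( "nevt should be a multiple of " + std::to_string( gputhreadsmin ) );
      int gpublocks = nevt / gputhreads;
      while( nevt != gpublocks * gputhreads )
      {
        gputhreads /= 2; // e.g. nevt=96: 256 -> 128 -> 64 -> 32, giving 3 blocks of 32 threads
        if( gputhreads < gputhreadsmin ) throw std::logic_error( "cannot choose gputhreads" );
        gpublocks = nevt / gputhreads;
      }
      return std::make_pair( gpublocks, gputhreads );
    }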
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
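//--------------------------------------------------------------------------
// For reference: a minimal host-only sketch (not part of the plugin code) of the
// F2C index mapping implemented by the transposition loops in these hunks, i.e. the
// copy from Fortran-style AOS momenta[ievt][ipar][ip4] into C++-style AOSOA
// momenta[ipagM][ipar][ip4][ieppM] with nevt = npagM * neppM. The npar, np4 and
// neppM values below are illustrative placeholders; the real ones come from
// CPPProcess and MemoryAccessMomenta.
#include <cassert>
namespace sketch
{
  constexpr int npar = 4;  // placeholder for CPPProcess::npar
  constexpr int np4 = 4;   // placeholder for CPPProcess::np4
  constexpr int neppM = 4; // placeholder for MemoryAccessMomenta::neppM
  inline void transposeMomentaF2C( const double* in, double* out, int nevt )
  {
    assert( nevt % neppM == 0 ); // same assumption as in the transposition code above
    for( int ievt = 0; ievt < nevt; ievt++ )
    {
      const int ipagM = ievt / neppM; // SIMD "page" index
      const int ieppM = ievt % neppM; // event index within the page
      for( int ipar = 0; ipar < npar; ipar++ )
        for( int ip4 = 0; ip4 < np4; ip4++ )
        {
          const int fpos = ievt * npar * np4 + ipar * np4 + ip4;                                  // AOS (Fortran-style)
          const int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; // AOSOA (C++-style)
          out[cpos] = in[fpos]; // F2C; swapping the roles of fpos and cpos gives the C2F direction
        }
    }
  }
}
//--------------------------------------------------------------------------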
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MGVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
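//--------------------------------------------------------------------------
// A small host-only illustration (an assumption-laden sketch, not the plugin code)
// of the compile-time precision dispatch behind the gpuBlasT* aliases added to
// GpuAbstraction.h above: one generic "T" name resolves to the single- or
// double-precision entry point depending on MGONGPU_FPTYPE2_FLOAT. The sketchSdot
// and sketchDdot stand-ins below are hypothetical host functions used only to keep
// the example self-contained; the real aliases map to cuBLAS/hipBLAS calls.
#include <cstdio>
inline float sketchSdot( int n, const float* x, const float* y )
{
  float s = 0;
  for( int i = 0; i < n; i++ ) s += x[i] * y[i];
  return s;
}
inline double sketchDdot( int n, const double* x, const double* y )
{
  double s = 0;
  for( int i = 0; i < n; i++ ) s += x[i] * y[i];
  return s;
}
#ifdef MGONGPU_FPTYPE2_FLOAT
typedef float fptype2sketch;
#define sketchTdot sketchSdot // "T" resolves to the single-precision variant
#else
typedef double fptype2sketch;
#define sketchTdot sketchDdot // "T" resolves to the double-precision variant
#endif
inline void sketchTdotDemo()
{
  fptype2sketch x[3] = { 1, 2, 3 }, y[3] = { 4, 5, 6 };
  std::printf( "dot = %f\n", (double)sketchTdot( 3, x, y ) ); // same call site for both precisions
}
//--------------------------------------------------------------------------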
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
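//--------------------------------------------------------------------------
// A host-only sketch (not the plugin code) of the runtime toggle pattern used in the
// MatrixElementKernelDevice constructor above: a feature is enabled when an
// environment variable such as CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty,
// and the decision (with its one-time INFO printout) is taken only once.
#include <cstdlib>
#include <iostream>
#include <string>
inline bool sketchUseBlasColorSum()
{
  static const bool enabled = []() {
    const char* env = std::getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" );
    const bool on = ( env != nullptr && std::string( env ) != "" );
    std::cout << ( on ? "INFO: BLAS color sum enabled at runtime"
                      : "INFO: BLAS color sum disabled at runtime" )
              << std::endl;
    return on;
  }();
  return enabled;
}
//--------------------------------------------------------------------------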
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h index 65a101888d..2fa0ce29e0 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ 
namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer 
DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc index 7bd57a8dbb..624eb3e3d4 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 1; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
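//--------------------------------------------------------------------------
// A host-only sketch (not the plugin code) of the event-major layout behind the
// DeviceAccessJamp2 accessor introduced above: the jamp2 value for color icol and
// event ievt lives at buffer[icol * nevt + ievt], so that consecutive GPU threads
// (consecutive ievt) touch consecutive addresses, i.e. accesses are coalesced.
struct SketchJamp2View
{
  double* buffer; // points to ncolor * nevt values, contiguous in ievt for each icol
  int nevt;       // number of events
  double& at( int icol, int ievt ) { return buffer[icol * nevt + ievt]; }
};
//--------------------------------------------------------------------------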
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -354,152 +410,43 @@ namespace mg5amcCpu jamp_sv[0] -= amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_epem_mupmum()?) 
- - // The color denominators (initialize all array elements, with ncolor=1) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1 }; // 1-D array[1] - - // The color matrix (initialize all array elements, with ncolor=1) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { { 1 } }; // 2-D array[1][1] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
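//--------------------------------------------------------------------------
// For reference: a standalone host-only sketch (not the plugin code) of the quadratic
// form that the removed inline loops below computed, and that the color-sum code
// factored out of calculate_jamps (per the comments above) now evaluates. With a real
// symmetric color matrix cf, color denominators denom and complex color-flow
// amplitudes jamp, the contribution to |M|^2 is
// sum_i ( jampR_i * (cf.jampR)_i + jampI_i * (cf.jampI)_i ) / denom_i,
// since for a real matrix the cross terms of (A-iB).cf.(A+iB) cancel.
#include <complex>
#include <vector>
inline double sketchColorSum( const std::vector<std::complex<double>>& jamp, // [ncolor]
                              const std::vector<std::vector<double>>& cf,    // [ncolor][ncolor]
                              const std::vector<double>& denom )             // [ncolor]
{
  const int ncolor = (int)jamp.size();
  double me2 = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    double ztempR = 0, ztempI = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ )
    {
      ztempR += cf[icol][jcol] * jamp[jcol].real();
      ztempI += cf[icol][jcol] * jamp[jcol].imag();
    }
    me2 += ( ztempR * jamp[icol].real() + ztempI * jamp[icol].imag() ) / denom[icol];
  }
  return me2;
}
//--------------------------------------------------------------------------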
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -539,7 +486,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -572,6 +523,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MZ, (fptype)m_pars->mdl_WZ }; @@ -613,6 +568,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -733,8 +692,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -742,25 +701,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + 
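      // Illustrative note (hypothetical helper, not part of this kernel): the selection below is
      // inverse-CDF sampling on the cumulative sums just written into ghelAllMEs. Given running
      // sums c[0] <= c[1] <= ... <= c[n-1] and a random number r in [0,1), it picks the first
      // ighel with r < c[ighel] / c[n-1], i.e. each good helicity is chosen with probability
      // proportional to its |M|^2 contribution. A scalar sketch of the same logic:
      //   int pickFromCumulative( const fptype* c, const int n, const fptype r )
      //   {
      //     for( int i = 0; i < n; i++ )
      //       if( r < c[i] / c[n - 1] ) return i;
      //     return n - 1; // guard against r rounding up towards 1
      //   }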
//printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -905,13 +1042,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 4 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -923,18 +1054,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -959,93 +1095,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1087,7 +1160,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1110,7 +1183,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1119,21 +1192,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1147,8 +1222,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1164,11 +1241,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1270,14 +1348,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h index 159826a904..9339b0e34c 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 2; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 1; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum 
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f index 70fe04e4d8..3ce157a97e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f index 280eff025e..60bee2a1c7 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,7 +140,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF EP1=PDG2PDF(LPP(IB(1)),-11, IB(1),XBK(IB(1)), QSCALE) IF (PDLABEL.EQ.'dressed') EP1_COMPONENTS(1:4 ) = @@ -149,7 +149,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF EM2=PDG2PDF(LPP(IB(2)),11, IB(2),XBK(IB(2)), QSCALE) IF (PDLABEL.EQ.'dressed') EM2_COMPONENTS(1:4 ) = @@ -228,7 +228,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -302,6 +302,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -385,14 +389,14 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) EP1(IVEC)=PDG2PDF(LPP(IB(1)),-11, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) IF (PDLABEL.EQ.'dressed') EP1_COMPONENTS(1:4 , IVEC) = $ EE_COMPONENTS(1:4) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) EM2(IVEC)=PDG2PDF(LPP(IB(2)),11, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) IF (PDLABEL.EQ.'dressed') EM2_COMPONENTS(1:4 , IVEC) = $ EE_COMPONENTS(1:4) ENDIF @@ -460,51 +464,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) 
THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/color_sum.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/color_sum.cc new file mode 100644 index 0000000000..44aadd6b60 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/color_sum.cc @@ -0,0 +1,425 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=1) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1 }; // 1-D array[1] + + // The color matrix (initialize all array elements, with ncolor=1) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { { 1 } }; // 2-D array[1][1] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + 
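As an aside to the new color_sum.cc code, the color algebra implemented by color_sum_cpu below (and, further down, by the CUDA kernel and BLAS paths) is the quadratic form |M|^2 += sum_ij conj(J_i) * (colorMatrix[i][j]/colorDenom[i]) * J_j. Because the color matrix is real, the form splits into separate sums over the real and imaginary parts of the jamps, and because the normalized matrix is symmetric (#475) only the upper triangle is needed, with off-diagonal terms counted twice. A minimal scalar C++ sketch with hypothetical standalone inputs (illustration only, not the plugin's data layout):

  #include <complex>

  // Scalar sketch of the color sum for one event and one helicity: cf is the symmetric
  // normalized matrix colorMatrix[i][j]/colorDenom[i] flattened row-major, jamp holds the
  // ncol partial amplitudes (hypothetical inputs, for illustration only).
  double colorSumSketch( const std::complex<double>* jamp, const double* cf, const int ncol )
  {
    double me2 = 0;
    for( int i = 0; i < ncol; i++ )
    {
      // Diagonal term, then twice the upper triangle (symmetry of the normalized matrix)
      double ztempR = cf[i * ncol + i] * jamp[i].real();
      double ztempI = cf[i * ncol + i] * jamp[i].imag();
      for( int j = i + 1; j < ncol; j++ )
      {
        ztempR += 2 * cf[i * ncol + j] * jamp[j].real();
        ztempI += 2 * cf[i * ncol + j] * jamp[j].imag();
      }
      me2 += jamp[i].real() * ztempR + jamp[i].imag() * ztempI;
    }
    return me2; // this helicity's contribution to |M|^2, before the final 1/helcolDenominators
  }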
+#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using 
E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good 
helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } 
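    // Layout note (illustrative only): with the striding used in convertD2F_Jamps above, the flat
    // index of jamp(ix2,icol,ihel,ievt) is ((ix2*ncolor + icol)*nhel + ihel)*nevt + ievt, e.g. via
    // a hypothetical helper
    //   inline size_t jampIndex( int ix2, int icol, int ihel, int ievt, int nhel, int nevt )
    //   { return ( ( (size_t)ix2 * ncolor + icol ) * nhel + ihel ) * nevt + ievt; }
    // so all real parts (ix2=0) occupy the first ncolor*nhel*nevt entries and all imaginary parts
    // (ix2=1) the next ncolor*nhel*nevt entries, which is what the ghelAllJampsReal and
    // ghelAllJampsImag pointers below rely on.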
+ // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/color_sum.h 
b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/configs.inc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/configs.inc index b17a3fe72a..e42ad21d89 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/configs.inc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/configs.inc @@ -12,3 +12,5 @@ C Diagram 2 DATA TPRID(-1,2)/0/ C Number of configs DATA MAPCONFIG(0)/2/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/cudacpp_overlay.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/driver.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/driver.f index ec5722702a..30cca27587 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/driver.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. 
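For reference, an illustrative sketch (not part of this patch; buffer and function names are hypothetical): the BLAS path in color_sum_blas above computes, for each (helicity, event) pair, ME += Re( J^dagger * CF * J ), where CF is the (real) normalized color matrix held in devNormColMat and J is the vector of ncolor color amplitudes for that pair. Step 1 forms Ztemp = CF * J for the real and imaginary parts separately (two GEMMs over all nhel*nevt columns), and Step 2 accumulates the per-pair dot products into the MEs (two strided-batched GEMMs with a 1x1 result per batch and beta=1). A minimal CPU reference of that arithmetic:

// Illustrative CPU reference for the BLAS color sum above (assumed names, not part of the patch).
// CF is the real, normalized ncolor x ncolor color matrix; jamps uses the
// [2][ncolor][nhel][nevt] layout (real block followed by imaginary block).
#include <cstddef>
#include <vector>

void colorSumReference( std::vector<double>& allMEs,      // [nhel*nevt], accumulated in place (beta=1)
                        const std::vector<double>& jamps, // [2*ncolor*nhel*nevt]
                        const std::vector<double>& cf,    // [ncolor*ncolor], row-major
                        int ncolor, int nhel, int nevt )
{
  const std::size_t nhe = std::size_t( nhel ) * nevt; // one "batch" per (ihel,ievt) pair
  const double* jR = jamps.data();                    // real parts
  const double* jI = jamps.data() + ncolor * nhe;     // imaginary parts
  for( std::size_t k = 0; k < nhe; k++ )
  {
    double me = 0;
    for( int i = 0; i < ncolor; i++ )
    {
      double zR = 0, zI = 0; // Step 1: Ztemp = CF * Jamps (real and imaginary parts separately)
      for( int j = 0; j < ncolor; j++ )
      {
        zR += cf[i * ncolor + j] * jR[j * nhe + k];
        zI += cf[i * ncolor + j] * jI[j * nhe + k];
      }
      me += jR[i * nhe + k] * zR + jI[i * nhe + k] * zI; // Step 2: Re( conj(J) . Ztemp )
    }
    allMEs[k] += me; // beta=1: add to the running MEs for this helicity and event
  }
}

The super-buffer layout assumed here (ievt fastest, then ihel, then icol, with the real block before the imaginary block) matches the DeviceAccessJamp striding in the new color_sum.h further down in this diff.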
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/fbridge.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/makefile_original.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f index 1a2e5df4e6..7cc484494b 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -71,10 +71,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -236,17 +233,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -325,7 +311,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -368,7 +354,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -411,17 +398,22 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 1) /1.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 1) /1/ C 1 ColorOne() C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WZ.NE.0D0) FK_MDL_WZ = SIGN(MAX(ABS(MDL_WZ), ABS(MDL_MZ - $ *SMALL_WIDTH_TREATMENT)), MDL_WZ) + FK_ZERO = 0D0 + IF(MDL_WZ.NE.0D0) THEN + FK_MDL_WZ = SIGN(MAX(ABS(MDL_WZ), ABS(MDL_MZ + $ *SMALL_WIDTH_TREATMENT)), MDL_WZ) + ELSE + FK_MDL_WZ = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
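An illustrative aside (not part of the patch): the previous hunk replaces the dense REAL*8 CF(NCOLOR,NCOLOR) color matrix with an integer array CF(NCOLOR*(NCOLOR+1)/2) holding only the upper triangle, plus a common integer DENOM that is divided out once at the end of the color sum in the next hunk; that hunk walks the packed storage with the sequential CF_INDEX counter over J = I, NCOLOR. A small standalone C++ check of the packed-index convention (the ncolor value and names are hypothetical):

// Packed upper-triangle indexing used by the new CF(NCOLOR*(NCOLOR+1)/2) storage
// (illustrative check only; not part of the patch).
#include <cassert>

int packedIndex( int i, int j, int ncolor ) // 1-based, requires j >= i
{
  return ( i - 1 ) * ncolor - ( i - 1 ) * ( i - 2 ) / 2 + ( j - i + 1 );
}

int main()
{
  const int ncolor = 6; // any small value
  int cfIndex = 0;      // mirrors CF_INDEX in the Fortran loop below
  for( int i = 1; i <= ncolor; i++ )
    for( int j = i; j <= ncolor; j++ )
    {
      ++cfIndex; // DO J = I, NCOLOR walks the row-major upper triangle sequentially
      assert( cfIndex == packedIndex( i, j, ncolor ) );
    }
  assert( cfIndex == ncolor * ( ncolor + 1 ) / 2 ); // total packed size, as in CF's declaration
  return 0;
}

The closed-form index is only for illustration; the generated Fortran never needs it, since it always traverses the triangle in the same fixed order.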
@@ -455,10 +447,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -467,6 +461,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/addmothers.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/addmothers.f index 9a31ed201d..d6cded9a2d 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else @@ -220,7 +221,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, ncolmp=0 endif if(mo_color.gt.1.and. - $ mo_color.ne.3.and.mo_color.ne.8)then + $ mo_color.ne.3.and.mo_color.ne.8.and.mo_color.ne.6)then da_color(1)=get_color(jpart(1,ida(1))) da_color(2)=get_color(jpart(1,ida(2))) call write_error(da_color(1), da_color(2), mo_color) @@ -326,8 +327,8 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif endif !end of check on LC -c Just zero helicity info for intermediate states - jpart(7,i) = 0 +c Just No helicity info for intermediate states + jpart(7,i) = 9 enddo ! 
do i 100 continue if (is_LC) call check_pure_internal_flow(icolalt,jpart, maxcolor) @@ -586,13 +587,13 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, i3=i3+1 c color for t-channels needs to be reversed if(i3.eq.1) icol(2,ires)=icolmp(1,i) - if(i3.eq.2) icol(1,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 c color for t-channels needs to be reversed if(i3bar.eq.1) icol(1,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(2,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(2,i) endif enddo @@ -764,6 +765,14 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, endif endif c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) + elseif(mo_color.eq.6.and.i3.eq.0.and.i3bar.eq.2)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue + elseif(mo_color.eq.6.and.i3.eq.2.and.i3bar.eq.0)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue else c Don't know how to deal with this call write_error(i3,i3bar,mo_color) @@ -814,12 +823,12 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(icolmp(1,i).gt.0)then i3=i3+1 if(i3.eq.1) icol(1,ires)=icolmp(1,i) - if(i3.eq.2) icol(2,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 if(i3bar.eq.1) icol(2,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(1,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(2,i) endif enddo @@ -830,23 +839,33 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(n3.le.1.and.n3bar.eq.0) icol(2,ires)=0 if(i3.ne.n3.or.i3bar.ne.n3bar) then - if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.0.and.i3.eq.0)then + if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.i3)then c This is an epsilon index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(1,ires)=maxcolor + if(i3.eq.0) then + maxcolor=maxcolor+1 + icol(1,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(2,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(2,ires)=-maxcolor endif - elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.0.and.i3bar.eq.0)then + elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.i3bar)then c This is an epsilonbar index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(2,ires)=maxcolor + if(i3bar.eq.0)then + maxcolor=maxcolor+1 + icol(2,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(1,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(1,ires)=-maxcolor endif elseif(n3.gt.0.and.n3bar.eq.0.and.i3-i3bar.eq.n3.or. $ n3bar.gt.0.and.n3.eq.0.and.i3bar-i3.eq.n3bar.or. 
@@ -961,6 +980,12 @@ subroutine fix_s_color_indices(n3,n3bar,i3,i3bar,ncolmp,icolmp, if(n3.eq.1) icol(1,ires)=max_n3 if(n3bar.eq.1) icol(2,ires)=min_n3bar endif + do i=ires,-1 + if (icol(1,i).eq.maxcol) icol(1,i)=mincol + if (icol(1,i).eq.-maxcol) icol(1,i)=-mincol + if (icol(2,i).eq.maxcol) icol(2,i)=mincol + if (icol(2,i).eq.-maxcol) icol(2,i)=-mincol + enddo c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) endif else diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cluster.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/color_sum.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * 
blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cuts.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. + return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. 
Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. + +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/genps.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/genps.f index 1c32e93f5d..5449ab9e30 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/genps.f @@ -124,7 +124,8 @@ subroutine gen_mom(iconfig,mincfig,maxcfig,invar,wgt,x,p1) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer fake_id + common/to_sprop/sprop,tprid,fake_id logical firsttime double precision xprop(3,nexternal),tprop(3,nexternal) @@ -1373,6 +1374,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1389,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not. 
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1637,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1657,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1818,8 +1834,8 @@ double precision function get_channel_cut(p, config) common/to_forest/ iforest, tstrategy integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) - integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer tprid(-max_branch:-1,lmaxconfigs), fake_id + common/to_sprop/sprop,tprid,fake_id double precision stot,m1,m2 common/to_stot/stot,m1,m2 @@ -1915,7 +1931,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1946,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile b/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile deleted file mode 100644 index 49e6800fff..0000000000 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile +++ /dev/null @@ -1,327 +0,0 @@ -SHELL := /bin/bash - -include ../../Source/make_opts - -# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) -# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing -include ../../src/cudacpp_config.mk -ifeq ($(CUDACPP_BUILDDIR),) -$(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) -endif - -# Disable all Fortran warnings? 
-FFLAGS+= -w - -# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html -FFLAGS+= -cpp - -# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) -CXXFLAGS = -O3 -Wall -Wshadow -Wextra - -# Add -std=c++17 explicitly to avoid build errors on macOS -# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" -ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 -endif - -# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) -ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) - override CXX:=ccache $(CXX) -endif -###ifeq ($(USECCACHE)$(shell echo $(FC) | grep ccache),1) -### override FC:=ccache $(FC) -###endif - -# Load additional dependencies of the bias module, if present -ifeq (,$(wildcard ../bias_dependencies)) -BIASDEPENDENCIES = -else -include ../bias_dependencies -endif - -# Definitions - -LIBDIR = ../../lib/ -BINDIR = ../../bin/ -PROG = madevent - -ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") - include ../MadLoop_makefile_definitions -else - LINK_LOOP_LIBS = - LOOP_LIBS = - LOOP_INCLUDE = - LINK_MADLOOP_LIB = - MADLOOP_LIB = -endif - -LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias - -CUDACPP_MAKEFILE=cudacpp.mk -processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') -ifeq ($(BACKEND),cuda) -CUDACPP_COMMONLIB=mg5amc_common_cuda -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda -else ifeq ($(BACKEND),hip) -CUDACPP_COMMONLIB=mg5amc_common_hip -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip -else -CUDACPP_COMMONLIB=mg5amc_common_cpp -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp -endif - -LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) - -ifneq ("$(wildcard ../../Source/RUNNING)","") - LINKLIBS += -lrunning - LIBS += $(LIBDIR)librunning.$(libext) -endif - - -# Source files - -MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) -MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) -ifeq ($(strip $(MATRIX_HEL)),) - MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) -endif - - -PROCESS= myamp.o genps.o unwgt.o setcuts.o get_color.o \ - cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ - idenparts.o dummy_fct.o - -DSIG=driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) -DSIG_cudacpp=driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) - -SYMMETRY = symmetry.o idenparts.o - -# Binaries - -ifeq ($(UNAME),Darwin) -LDFLAGS += -lc++ # avoid 'Undefined symbols' for chrono::steady_clock on macOS (checked with otool -L libmg5amc_gg_ttx_cpp.so) -LDFLAGS += -mmacosx-version-min=11.3 # avoid "ld: warning: object file was built for newer macOS version than being linked" -else -LDFLAGS += -Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 (not supported on macOS) -endif - -# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) -.DEFAULT_GOAL := all - -ifeq ($(BACKEND),cuda) -all: $(PROG)_fortran 
$(CUDACPP_BUILDDIR)/$(PROG)_cuda -else ifeq ($(BACKEND),hip) -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip -else -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp -endif - -# Disable OpenMP by default: enable OpenMP only if USEOPENMP=1 (#758) -ifeq ($(USEOPENMP),1) -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -override OMPFLAGS = -fopenmp -LINKLIBS += -liomp5 # see #578 -LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -override OMPFLAGS = -fopenmp -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 -else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang -else -override OMPFLAGS = -fopenmp -endif -endif - -$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o - $(FC) -o $(PROG)_fortran $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) - -$(LIBS): .libs - -.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat - cd ../../Source; make - touch $@ - -$(CUDACPP_BUILDDIR)/.cudacpplibs: - $(MAKE) -f $(CUDACPP_MAKEFILE) - touch $@ - -# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH -# Use relative paths with respect to the executables ($ORIGIN on Linux) -# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary -ifeq ($(UNAME_S),Darwin) - override LIBFLAGSRPATH = -else ifeq ($(USEBUILDDIR),1) - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' -else - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' -endif - -.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link - -madevent_fortran_link: $(PROG)_fortran - rm -f $(PROG) - ln -s $(PROG)_fortran $(PROG) - -madevent_cuda_link: - $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) - -madevent_hip_link: - $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) - -madevent_cpp_link: - $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -override SUPPORTED_AVXS = cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto -madevent_%_link: - @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then echo "ERROR! 
Invalid target '$@' (supported madevent_cpp*_link targets are: $(foreach avx,$(SUPPORTED_AVXS),'madevent_cpp$(avx)_link'))"; exit 1; fi - $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_cuda now uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_hip also uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -counters.o: counters.cc timer.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -ompnumthreads.o: ompnumthreads.cc ompnumthreads.h - $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ - -$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) - $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) - -gensym: $(SYMMETRY) configs.inc $(LIBS) - $(FC) -o gensym $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) - -###ifeq (,$(wildcard fbridge.inc)) # Pointless: fbridge.inc always exists as this is the cudacpp-modified makefile! -###$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat -### cd ../../Source/MODEL; make -### -###$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat -### cd ../../Source; make -### -###$(LIBDIR)libpdf.$(libext): -### cd ../../Source/PDF; make -### -###$(LIBDIR)libgammaUPC.$(libext): -### cd ../../Source/PDF/gammaUPC; make -###endif - -# Add source so that the compiler finds the DiscreteSampler module. 
-$(MATRIX): %.o: %.f - $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%.o: %.f - $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%_cudacpp.o: %.f - $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ - -# Dependencies - -driver.f: genps.inc -symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc -genps.o: genps.inc nexternal.inc configs.inc -dummy_fct.0: run.inc genps.inc -cuts.o: genps.inc nexternal.inc pmass.inc -setcuts.o: genps.inc run_config.inc -invarients.o: genps.inc nexternal.inc -myamp.o: props.inc genps.inc nexternal.inc -reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ - run_config.inc -cluster.o: cluster.inc genps.inc nexternal.inc message.inc -addmothers.o: genps.inc nexternal.inc symswap.inc message.inc -unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ - run_config.inc -initcluster.o: message.inc - -# Extra dependencies on discretesampler.mod - -auto_dsig.o: .libs -driver.o: .libs -driver_cudacpp.o: .libs -$(MATRIX): .libs -genps.o: .libs - -# Cudacpp bldall targets - -ifeq ($(UNAME_P),ppc64le) -bldavxs: bldnone bldsse4 -else ifeq ($(UNAME_P),arm) -bldavxs: bldnone bldsse4 -else -bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z -endif - -ifneq ($(shell which hipcc 2>/dev/null),) -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldhip bldcuda bldavxs -else -bldall: bldhip bldavxs -endif -else -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldcuda bldavxs -else -bldall: bldavxs -endif -endif - -bldcuda: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cuda - -bldhip: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=hip - -bldnone: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppnone - -bldsse4: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 - -bldavx2: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 - -bld512y: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y - -bld512z: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z - -# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) - -clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn - $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(CUDACPP_BUILDDIR)/$(PROG)_hip - -cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src - $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall - rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs - rm -f .libs - -cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src - make -C ../../Source cleanall - rm -rf $(LIBDIR)libbias.$(libext) - rm -f ../../Source/*.mod ../../Source/*/*.mod - -distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation - $(MAKE) -f $(CUDACPP_MAKEFILE) distclean diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile b/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile new file mode 120000 index 0000000000..9fba275947 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile @@ -0,0 +1 @@ +makefile_wrapper.mk \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile_original.mk new file 
mode 100644 index 0000000000..348c283be7 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile_original.mk @@ -0,0 +1,101 @@ +include ../../Source/make_opts +FFLAGS+= -w + +# Load additional dependencies of the bias module, if present +ifeq (,$(wildcard ../bias_dependencies)) +BIASDEPENDENCIES = +else +include ../bias_dependencies +endif + +# Definitions + +LIBDIR = ../../lib/ +BINDIR = ../../bin/ +PROG = madevent + +ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") + include ../MadLoop_makefile_definitions +else + LINK_LOOP_LIBS = + LOOP_LIBS = + LOOP_INCLUDE = + LINK_MADLOOP_LIB = + MADLOOP_LIB = +endif + +LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L../../lib/ -ldhelas -ldsample -lmodel -lgeneric -lpdf -lgammaUPC -lcernlib $(llhapdf) -lbias + +LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) + +ifneq ("$(wildcard ../../Source/RUNNING)","") + LINKLIBS += -lrunning + LIBS += $(LIBDIR)librunning.$(libext) +endif + + +# Source files + +MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) +MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) +ifeq ($(strip $(MATRIX_HEL)),) + MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) +endif + + +PROCESS= driver.o myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o \ + $(patsubst %.f,%.o,$(wildcard auto_dsig*.f)) \ + +SYMMETRY = symmetry.o idenparts.o + +# Binaries + +$(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) + $(FC) -o $(PROG) $(PROCESS) $(MATRIX) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) + +$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat + cd ../../Source/MODEL; make + +$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat + cd ../../Source; make + +$(LIBDIR)libpdf.$(libext): + cd ../../Source/PDF; make + +$(LIBDIR)libgammaUPC.$(libext): + cd ../../Source/PDF/gammaUPC; make + +# Add source so that the compiler finds the DiscreteSampler module. 
+$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +# Dependencies + +driver.f: genps.inc +symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc +genps.o: genps.inc nexternal.inc configs.inc +dummy_fct.0: run.inc genps.inc +cuts.o: genps.inc nexternal.inc pmass.inc +setcuts.o: genps.inc run_config.inc +invarients.o: genps.inc nexternal.inc +myamp.o: props.inc genps.inc nexternal.inc +reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ + run_config.inc +cluster.o: cluster.inc genps.inc nexternal.inc message.inc +addmothers.o: genps.inc nexternal.inc symswap.inc message.inc +unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ + run_config.inc +initcluster.o: message.inc + +clean: + $(RM) *.o gensym madevent madevent_forhel diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/myamp.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min with dsqrt_shatmax**2 with the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/reweight.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/reweight.f index 0a0bafa7c1..9d8fe1c4f0 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/reweight.f @@ -976,9 +976,9 @@ logical function setclscales(p, keepq2bck, ivec) $ ' and jcentral is ',jcentral(1),jcentral(2) if (btest(mlevel,3)) then - write(*,'(a$)') 'QCD jets (final): ' + write(*,'(a,$)') 'QCD jets (final): ' do i=3,nexternal - if(iqjets(i).gt.0) write(*,'(i3$)') i + if(iqjets(i).gt.0) write(*,'(i3,$)') i enddo write(*,*) endif @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) @@ -1387,7 +1387,9 @@ double precision function rewgt(p, ivec) integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - include 'configs.inc' + integer fake_id + common/to_sprop/sprop,tprid,fake_id +c include 'configs.inc' real*8 xptj,xptb,xpta,xptl,xmtc real*8 xetamin,xqcut,deltaeta common /to_specxpt/xptj,xptb,xpta,xptl,xmtc,xetamin,xqcut,deltaeta @@ -1588,6 +1590,8 @@ double precision function rewgt(p, ivec) $ ipdgcl(1,igraphs(1),iproc),ipart,.false.).and. $ (goodjet(idacl(n,1)).or.goodjet(idacl(n,2)))) then c alpha_s weight + + if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1600,6 +1604,7 @@ double precision function rewgt(p, ivec) write(*,*)' as: ',alphas(alpsfact*dsqrt(q2now)), & '/',asref,' -> ',alphas(alpsfact*dsqrt(q2now))/asref write(*,*)' and G=',SQRT(4d0*PI*ALPHAS(scale)) + endif endif endif endif diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/symmetry.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/symmetry.f index 309540a0a2..d0706e90b4 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/symmetry.f @@ -51,6 +51,7 @@ program symmetry integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) + integer fake_id include 'configs.inc' data use_config/0,lmaxconfigs*0/ @@ -232,7 +233,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +243,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +261,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/unwgt.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/ee_mumu.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/ee_mumu.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py index 42d82818d0..2bc6174b85 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3306,7 +3353,7 @@ def edit_dummy_fct_from_file(self, filelist, outdir): def retro_compatible_custom_fct(lines, mode=None): f77_type = ['real*8', 'integer', 'double precision', 'logical'] - function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ + function_pat = re.compile(r'^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ % {'type':'|'.join(f77_type)}, re.I+re.M) include_pat = re.compile(r"\s+include\s+[\'\"]([\w\./]*)") @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - 
logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/check_param_card.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py index 9ff7390cf5..8de498fcc2 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. 
Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. (beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. 
Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" @@ -6868,8 +6890,9 @@ def handle_alarm(signum, frame): else: log_level=20 - - if run_card: + if run_card and (run_card['lpp1'] !=0 or run_card['lpp2'] !=0): + # They are likely case like lpp=+-3, where alpas not need reset + # but those have dedicated name of pdf avoid the reset as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, 'cteq6_l': 0.118, diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/file_writers.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/files.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/files.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - logger.warning(why) 
+ logger.warning("fail to cp", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

\ Postscript Diagrams for $proc\<\/A\> \ \n"; for($j=1;$j<$pages;$j++){ - print PAGE "\\"Page \ \n"; + print PAGE "\\"Page \ \n"; }#end of for # -# In case I didn't include all of the diagrams as jpeg, warn user +# In case I didn't include all of the diagrams as PNG, warn user # - if (-e "matrix$imatrix$max_jpg.jpg" ) { - print PAGE "

To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "

To save bandwidth not all diagrams were converted to PNG."; print PAGE "

To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 'nprocs' in opts: + 
self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/histograms.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/histograms.py index 51ae2914fc..0883cd9613 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the variation. 
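#----------------------------------------------------------------------------------------------------------------------------------
# Illustrative sketch (not the plugin's actual code): the gen_ximprove_gridpack changes above add an optional nprocs-way parallel
# mode, submitting refine jobs to a local pool (cluster.MultiCore) monitored by gridpack_wait_monitoring instead of running them
# one by one through cluster.onecore.launch_and_wait. The stand-alone sketch below mirrors that dispatch pattern using only the
# standard library; the job dicts, their keys and the helper name are hypothetical and are not the plugin's API.
import subprocess
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

def run_refine_jobs(jobs, nprocs=1):
    """Run a list of {'exe': path, 'cwd': dir} jobs serially or with nprocs workers (illustration only)."""
    if nprocs == 1:
        # serial path, with an occasional progress message so long runs do not look stuck
        for i, job in enumerate(jobs, 1):
            if i % 5 == 0:
                print("Working on job %d of %d" % (i, len(jobs)))
            subprocess.run([job['exe']], cwd=job['cwd'], check=True)
        return
    # parallel path: submit everything, then report progress as jobs complete
    start = time.time()
    with ThreadPoolExecutor(max_workers=nprocs) as pool:
        futures = [pool.submit(subprocess.run, [j['exe']], cwd=j['cwd'], check=True) for j in jobs]
        for ndone, fut in enumerate(as_completed(futures), 1):
            fut.result()  # re-raise any job failure
            print("Gridpack event generation: %d pending, %d done [%.0fs]" % (len(jobs) - ndone, ndone, time.time() - start))
#----------------------------------------------------------------------------------------------------------------------------------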
+ if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2405,10 +2404,10 @@ def output(self, path, format='gnuplot',number_of_ratios = -1, gnuplot_output_list=gnuplot_output_list_v5 else: output, _ = p.communicate() - output.decode(errors='ignore') + output = output.decode(errors='ignore') if not output: gnuplot_output_list=gnuplot_output_list_v5 - elif float(output.split()[1]) < 5. : + elif int(output.split()[1].split('.')[0]) < 5 : gnuplot_output_list=gnuplot_output_list_v4 else: gnuplot_output_list=gnuplot_output_list_v5 @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. 
Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. 
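#----------------------------------------------------------------------------------------------------------------------------------
# Illustrative sketch (not the plugin's actual code): the histograms.py hunk above fixes the gnuplot detection for Python 3 by
# keeping the decoded subprocess output (output = output.decode(...)) and by comparing the integer major version of the reported
# gnuplot release instead of parsing the whole version token with float(). A stand-alone version of that check could look like the
# function below; the function name is mine and it assumes a gnuplot executable is available on the PATH.
import subprocess

def gnuplot_major_version(executable='gnuplot'):
    # typical output: b"gnuplot 5.4 patchlevel 2\n" -> decode, take "5.4", keep only the major part
    out = subprocess.run([executable, '--version'], capture_output=True).stdout
    out = out.decode(errors='ignore')
    return int(out.split()[1].split('.')[0])

# e.g. select the gnuplot-5 output templates only when the major version is at least 5:
# use_v5_templates = gnuplot_major_version() >= 5
#----------------------------------------------------------------------------------------------------------------------------------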
import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/lhe_parser.py index f6e47956cd..d4b94bab10 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -1792,7 +1794,10 @@ def add_decays(self, pdg_to_decay): if particle.pdg in pdg_to_decay and pdg_to_decay[particle.pdg]: one_decay = pdg_to_decay[particle.pdg].pop() self.add_decay_to_particle(i, one_decay) + particle.helicity = 9 return 
self.add_decays(pdg_to_decay) + + return self @@ -2166,10 +2171,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2961,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..dea35930ea 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz 
%(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3667,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_comine_iteration(self, line): + def do_combine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
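The pseudorapidity fix in the lhe_parser.py hunk above swaps the numerator and denominator inside the logarithm: with the standard definition eta = 0.5*ln((|p|+pz)/(|p|-pz)), the corrected code now returns positive eta for forward (pz > 0) momenta. A small self-check in plain Python, using the equivalent eta = -ln(tan(theta/2)) form and hypothetical momentum components:

import math

def pseudorapidity(px, py, pz):
    # eta = 0.5 * ln((|p| + pz) / (|p| - pz)), as in the corrected code
    norm = math.sqrt(px**2 + py**2 + pz**2)
    return 0.5 * math.log((norm + pz) / (norm - pz))

px, py, pz = 1.0, 2.0, 3.0
theta = math.acos(pz / math.sqrt(px**2 + py**2 + pz**2))
print(pseudorapidity(px, py, pz))        # ~1.10
print(-math.log(math.tan(theta / 2.0)))  # same value, cross-check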
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
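With the do_pythia8 changes above, the default is now to drive the shower through Pythia8's bundled main164 program, falling back to the legacy MG5aMC_PY8_interface only when main164 cannot be found (or when '--old_interface' is passed explicitly). A hedged sketch of that lookup, with a hypothetical pythia8_path and simplified error handling:

import os

def find_pythia_main(pythia8_path):
    # Try the two candidate locations used in the patch, installed layout first.
    candidates = [
        os.path.join(pythia8_path, 'share', 'Pythia8', 'examples', 'main164'),
        os.path.join(pythia8_path, 'examples', 'main164'),
    ]
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    return None  # caller would then retry with the old interface

main = find_pythia_main('/opt/pythia8')   # hypothetical installation path
if main is None:
    print('main164 not found (or not compiled); retrying with --old_interface')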
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
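The shell-wrapper templates in the hunks above are now filled in two passes, which is why the placeholders that must survive the first pass are doubled to %%s: the first % substitution injects the interface-dependent '-c' flag, the second injects the shell and executable names. A minimal demonstration of this two-stage %-formatting (the concrete values are illustrative):

# First pass: fill in the '-c' flag, leave the doubled placeholders intact.
template = """#!%%s
./%%s %s PY8Card.dat >& PY8_log.txt
""" % ('-c')

# Second pass: fill in shell and executable (hypothetical values).
script = template % ('/bin/bash', 'main164')
print(script)
# #!/bin/bash
# ./main164 -c PY8Card.dat >& PY8_log.txt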
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -5596,6 +5635,19 @@ def do_plot(self, line): else: logger.info('No valid files for delphes plot') + def do_compile(self, line): + """compile the current directory """ + + args = self.split_arg(line) + self.ask_run_configuration(mode='parton') + self.run_card = banner_mod.RunCard(pjoin(self.me_dir, 'Cards', 'run_card.dat')) + self.configure_directory(html_opening =False) + + for Pdir in self.get_Pdir(): + misc.sprint(Pdir) + self.compile(['gensym'], cwd=Pdir) + self.compile(['madevent_forhel'], cwd=Pdir) + ############################################################################ def do_syscalc(self, line): """Evaluate systematics variation weights for a given run""" @@ -6132,7 +6184,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in 
Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file.' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6896,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6906,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7023,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... 
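The new remove_empty_events helper above reads each channel's log backwards and files directories with empty events.lhe into reason buckets ('cuts', 'zero', 'bwconfig', 'not found', ...) before logging a summary. A simplified, self-contained sketch of that classification pattern using collections.defaultdict; the log messages are taken from the patch, but the surrounding logic (file sizes, BackRead, line counters) is omitted:

import collections

def classify(gdir_logs):
    # Map each G directory to a reason bucket based on its log tail.
    reasons = collections.defaultdict(list)
    for gdir, log_lines in gdir_logs.items():
        for line in reversed(log_lines):   # mimic reading the log backwards
            if 'Loosen cuts or increase max_events' in line:
                reasons['cuts'].append(gdir)
                break
            if 'all returned zero' in line:
                reasons['zero'].append(gdir)
                break
            if 'Impossible BW configuration' in line:
                reasons['bwconfig'].append(gdir)
                break
        else:
            reasons['not found'].append(gdir)
    return reasons

print(dict(classify({'P1/G1': ['...', 'all returned zero'],
                     'P1/G2': ['no matching message here']})))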
self.exec_cmd('combine_events') @@ -6902,6 +7051,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7066,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7078,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7203,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7223,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/restore_data b/epochX/cudacpp/ee_mumu.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/restore_data +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . 
-mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/sum_html.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s
' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/ee_mumu.mad/bin/madevent b/epochX/cudacpp/ee_mumu.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/madevent +++ b/epochX/cudacpp/ee_mumu.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h index 18f664e0d1..4dd98afc5d 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc index 37676c1d8d..dd3280eb5d 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h index 5fcde71f6b..0c43310313 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h index 7c6a082392..be5c5a6357 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
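In the write_param_card.py hunk further up, exec and eval now receive an explicit param_values dictionary (seeded with cmath) instead of writing into the module namespace; under Python 3 this is the reliable way to make parameters defined by one exec visible to later eval calls in the same function. A minimal sketch with made-up parameter expressions in the same spirit:

import cmath

param_values = {'cmath': cmath}   # shared namespace, as in the patch

# Hypothetical UFO-style parameters: name -> expression or literal value.
parameters = [('aEWM1', '127.9'),
              ('aEW', '1./aEWM1'),
              ('ee', '2*cmath.sqrt(aEW*cmath.pi)')]

for name, value in parameters:
    exec("%s = %s" % (name, value), globals(), param_values)

# Dependent expressions can now be evaluated in the same explicit namespace.
print(complex(eval('ee**2/(4*cmath.pi)', globals(), param_values)).real)  # ~1/127.9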
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk b/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
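The restore_data change earlier in this diff selects the xargs parallelism options per platform, because macOS ships no nproc and, per the script's own comment, its xargs is given an explicit -S1024 limit on the command length. The shipped fix stays in shell; purely for illustration, a hedged Python equivalent of the same host detection:

import os
import platform

def xargs_parallel_opts():
    # Mirror the shell logic: -P <ncpu> everywhere, plus -S1024 on macOS.
    ncpu = os.cpu_count() or 1
    if platform.system() == 'Darwin':
        return '-P %d -S1024' % ncpu
    return '-P %d' % ncpu

print(xargs_parallel_opts())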
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index f27925604a..96bc83705d 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +57,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006340742111206055  +DEBUG: model prefixing takes 0.004626750946044922  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -154,28 +153,28 @@ INFO: Process has 2 diagrams Total: 1 processes with 2 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_ee_mumu Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. 
+DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.267 s +ALOHA: aloha creates 4 routines in 0.213 s FFV1 FFV1 FFV2 @@ -184,17 +183,17 @@ ALOHA: aloha creates 4 routines in 0.267 s FFV4 FFV2_4 FFV2_4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. quit -real 0m0.659s -user 0m0.589s -sys 0m0.056s +real 0m0.603s +user 0m0.531s +sys 0m0.067s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/ee_mumu.sa/COPYRIGHT b/epochX/cudacpp/ee_mumu.sa/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/ee_mumu.sa/COPYRIGHT +++ b/epochX/cudacpp/ee_mumu.sa/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. 
- * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). 
+ */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
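The Bridge constructor above derives the GPU grid from the number of events: it starts from 256 threads per block and keeps halving until nevt is an exact multiple, never going below the 32-thread minimum. A small Python sketch of that selection loop (constants taken from the code above; error messages simplified):

def choose_gpu_grid(nevt, gputhreads=256, gputhreadsmin=32):
    # Return (gpublocks, gputhreads) such that gpublocks * gputhreads == nevt.
    if nevt < gputhreadsmin or nevt % gputhreadsmin != 0:
        raise RuntimeError('nevt should be a multiple of %d' % gputhreadsmin)
    gpublocks = nevt // gputhreads
    while nevt != gpublocks * gputhreads:
        gputhreads //= 2
        if gputhreads < gputhreadsmin:
            raise RuntimeError('cannot choose gputhreads')  # should never happen
        gpublocks = nevt // gputhreads
    return gpublocks, gputhreads

print(choose_gpu_grid(8192))  # (32, 256)
print(choose_gpu_grid(96))    # (3, 32)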
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
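        // Worked example of the index mapping in the loop below (illustrative values, assuming
        // npar=4, np4=4, neppM=4): event ievt=6 sits in page ipagM=1 at slot ieppM=2, so its
        // (ipar=2, ip4=3) component maps between fpos = 6*16 + 2*4 + 3 = 107 in the AOS
        // (Fortran) layout and cpos = 1*64 + 2*16 + 3*4 + 2 = 110 in the AOSOA (cudacpp) layout.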
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
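// Illustrative usage sketch (not taken from the plugin sources) of the checkGpuBlas macro and
// the gpuBlas* aliases defined in GpuRuntime.h and GpuAbstraction.h above; it assumes a CUDA or
// HIP build with BLAS enabled, and the function name and the column-major n*n device buffers
// dA, dB, dC are assumptions of this sketch.
#include "GpuAbstraction.h"
#include "GpuRuntime.h"
inline void exampleGemm( const double* dA, const double* dB, double* dC, const int n )
{
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) );
  const double alpha = 1.0, beta = 0.0;
  checkGpuBlas( gpuBlasDgemm( handle, GPUBLAS_OP_N, GPUBLAS_OP_N, // C = A * B
                              n, n, n, &alpha, dA, n, dB, n, &beta, dC, n ) );
  checkGpuBlas( gpuBlasDestroy( handle ) );
}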
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h index 65a101888d..2fa0ce29e0 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ 
namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer 
DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc index 16a91dd141..42f5c25dcb 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 1; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
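// Illustrative sketch (not taken from the plugin sources) of the jamp2 layout addressed by
// DeviceAccessJamp2 above: the per-color |jamp|^2 running sums live in a [ncolor][nevt]
// super-buffer, one contiguous nevt-long slice per color, so element (icol, ievt) sits at
// icol*nevt + ievt. The kernel name and the jampAbs2 input below are assumptions of this
// sketch; atomicAdd keeps the accumulation safe when several one-helicity streams update the
// same buffer concurrently.
__global__ void accumulateJamp2( double* colAllJamp2s, const double* jampAbs2, const int ncolor )
{
  const int nevt = gridDim.x * blockDim.x; // one GPU thread per event
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  for( int icol = 0; icol < ncolor; icol++ )
    atomicAdd( &colAllJamp2s[icol * nevt + ievt], jampAbs2[icol * nevt + ievt] );
}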
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -352,152 +408,43 @@ namespace mg5amcCpu jamp_sv[0] -= amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_epem_mupmum()?) 
- - // The color denominators (initialize all array elements, with ncolor=1) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1 }; // 1-D array[1] - - // The color matrix (initialize all array elements, with ncolor=1) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { { 1 } }; // 2-D array[1][1] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
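The comment above (kept verbatim in the new color_sum.cc later in this patch) states the rewrite only in words. In formulas, with J = A + iB the vector of colour amplitudes (jamp) and M the normalised colour matrix, M_ij = cf_ij / denom_i, which is real and symmetric (#475):

  $$ J^\dagger M J \;=\; (A - iB)^{T} M \, (A + iB) \;=\; A^{T} M A + B^{T} M B $$

because the cross terms i A^T M B - i B^T M A cancel for a real symmetric M. The triangular constexpr table then exploits A^T M A = \sum_i M_{ii} A_i^2 + 2 \sum_{i<j} M_{ij} A_i A_j (and likewise for B), which is why value[icol][icol] stores cf[icol][icol]/denom[icol] while value[icol][jcol] with jcol > icol stores 2*cf[icol][jcol]/denom[icol].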
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
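The mixed-precision merge removed from this hunk reappears in the new color_sum_cpu further down: two double-precision SIMD event pages are packed into one single-precision vector for the colour algebra, and the resulting squared sums are split back into the two double halves. A conceptual stand-alone illustration of what fpvmerge / fpvsplit0 / fpvsplit1 do, using plain arrays rather than the real fptype_v/fptype2_v vector types (neppV = 4 and the helper names are assumptions of this sketch, not part of the patch):

  #include <array>
  // Conceptual stand-ins: with fptype=double and fptype2=float, one float vector spans two double pages.
  constexpr int neppV = 4;                    // assumed SIMD width of one double event page
  using dvec = std::array<double, neppV>;     // one "double" event page
  using fvec = std::array<float, 2 * neppV>;  // one "float" vector holding two pages

  // Merge two double pages into one float vector (what fpvmerge does, conceptually)
  fvec merge( const dvec& lo, const dvec& hi )
  {
    fvec out{};
    for( int i = 0; i < neppV; i++ ) { out[i] = (float)lo[i]; out[neppV + i] = (float)hi[i]; }
    return out;
  }
  // Split the float result back into the two double pages (fpvsplit0 / fpvsplit1, conceptually)
  dvec split0( const fvec& v ) { dvec out{}; for( int i = 0; i < neppV; i++ ) out[i] = v[i]; return out; }
  dvec split1( const fvec& v ) { dvec out{}; for( int i = 0; i < neppV; i++ ) out[i] = v[neppV + i]; return out; }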
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -537,7 +484,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -570,6 +521,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MZ, (fptype)m_pars->mdl_WZ }; @@ -611,6 +566,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -731,8 +690,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -740,25 +699,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + 
//printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -903,13 +1040,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 4 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -921,18 +1052,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -957,93 +1093,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1085,7 +1158,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1108,7 +1181,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1117,21 +1190,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1145,8 +1220,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1162,11 +1239,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1268,14 +1346,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h index 159826a904..9339b0e34c 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 2; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 1; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum 
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/color_sum.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/color_sum.cc new file mode 100644 index 0000000000..44aadd6b60 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/color_sum.cc @@ -0,0 +1,425 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
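Before the body of the new color_sum.cc, a pseudocode recap (not part of the patch itself) of how the pieces introduced in the hunks above fit together in the GPU sigmaKin path:

  // for each good helicity ighel, in its own CUDA/HIP stream:
  //   calculate_jamps<<<gpublocks,gputhreads,0,stream[ighel]>>>( ... )  // fill ghelAllJamps
  //     (in multichannel mode also the per-helicity numerators/denominators and colAllJamp2s)
  // color_sum_gpu( ... )         // per-helicity |M|^2 via color_sum_kernel or cuBLAS/hipBLAS (color_sum_blas)
  // gpuDeviceSynchronize()       // wait until all helicity streams have completed
  // add_and_select_hel<<<...>>>  // sum |M|^2 over helicities and pick one helicity per event (#403)
  // select_col<<<...>>>          // multichannel only: pick one colour per event from the jamp2 sums (#402)
  // normalise_output<<<...>>>    // divide by spin/colour denominators (and apply numerator/denominator reweighting)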
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=1) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1 }; // 1-D array[1] + + // The color matrix (initialize all array elements, with ncolor=1) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { { 1 } }; // 2-D array[1][1] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
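To make the triangular normalisation concrete, a tiny worked example for a hypothetical process with ncolor=2 (the numbers are invented for illustration; for this e+ e- -> mu+ mu- process ncolor=1 and the matrix is simply {{1}}):

  // Hypothetical ncolor=2 inputs (NOT this process):
  //   colorDenom = { 3, 3 },  colorMatrix = { { 16, -2 }, { -2, 16 } }
  // The constexpr TriangularNormalizedColorMatrix would then hold
  //   value[0][0] = 16./3,  value[0][1] = 2*(-2.)/3 = -4./3,  value[1][0] = 0,  value[1][1] = 16./3
  // and the icol/jcol loop below would compute, per event,
  //   deltaME = value[0][0]*(A0*A0 + B0*B0) + value[0][1]*(A0*A1 + B0*B1) + value[1][1]*(A1*A1 + B1*B1)
  // with A = Re(jamp), B = Im(jamp), which equals the full symmetric quadratic form A^T M A + B^T M B.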
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/color_sum.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/cudacpp_overlay.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ 
b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/fbridge.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/makefile_original.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/color_sum.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], 
buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
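For reference, for each helicity ihel and event ievt the BLAS color sum above (Step 1 GEMM, then Step 2 batched dot products) accumulates ME += Re(J)^T C Re(J) + Im(J)^T C Im(J), where C is the (real) normalized color matrix and J is the vector of ncolor jamps stored with the striding described in DeviceAccessJamp. A minimal single-event C++ sketch of the same reduction follows; the helper and argument names are illustrative and not taken from this patch.
// Illustrative reference implementation (hypothetical helper, not part of this patch):
// single event, single helicity, row-major ncolor x ncolor (real) color matrix.
inline double colorSumRef( int ncolor, const double* colMat, const double* jampRe, const double* jampIm )
{
  double me = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    double ztempRe = 0, ztempIm = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ )
    {
      ztempRe += colMat[icol * ncolor + jcol] * jampRe[jcol]; // Step 1: Ztemp = ColorMatrix * Jamps (real part)
      ztempIm += colMat[icol * ncolor + jcol] * jampIm[jcol]; // Step 1: Ztemp = ColorMatrix * Jamps (imag part)
    }
    me += jampRe[icol] * ztempRe + jampIm[icol] * ztempIm; // Step 2: ME += Jamps dot Ztemp (real plus imag contributions)
  }
  return me;
}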
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
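For HASBLAS=hasNoBlas builds, the -DMGONGPU_HAS_NO_BLAS flag added to CXXFLAGS and GPUFLAGS above is what the C++/CUDA sources key on; a minimal sketch of the compile-time guard follows, with an illustrative constant name not taken from this patch.
// Illustrative guard on the flag set by this makefile (HASBLAS=hasNoBlas adds -DMGONGPU_HAS_NO_BLAS)
#include "mgOnGpuConfig.h"
#ifdef MGONGPU_HAS_NO_BLAS
constexpr bool useBlasColorSum = false; // color_sum_gpu must then receive a null BLAS handle (kernel path)
#else
constexpr bool useBlasColorSum = true; // a cuBLAS/hipBLAS handle may be created and passed (BLAS path)
#endif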
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LINKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below.
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. 
+ +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h index 18f664e0d1..4dd98afc5d 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc index 37676c1d8d..dd3280eb5d 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h index 5fcde71f6b..0c43310313 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h index d3c4ca5695..7d34de72f8 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose whether cuBLAS and hipBLAS are supported (they are used e.g. for the BLAS-based color sum on GPU) +// For both CUDA and HIP, BLAS is enabled by default, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!]
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk b/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 453da8d298..f28f5709d8 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +57,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0065233707427978516  +DEBUG: model prefixing takes 0.004693746566772461  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,21 +150,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.009 s +1 processes with 3 diagrams generated in 0.007 s Total: 1 processes with 3 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_gg_tt INFO: remove old information in CODEGEN_mad_gg_tt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -177,53 +176,48 @@ FileWriter t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1577]  Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.072 s +Wrote files for 10 helas calls in 0.064 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.144 s +ALOHA: aloha creates 2 routines in 0.115 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.132 s +ALOHA: aloha creates 4 routines in 0.131 s VVV1 FFV1 FFV1 FFV1 -FileWriter for 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses/P1_gg_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. +DEBUG: result.returncode =  0 [output.py at line 273]  +Output to directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README +/home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README Run "open index.html" to see more information about this process. 
quit -real 0m1.991s -user 0m1.616s -sys 0m0.275s +real 0m2.156s +user 0m1.749s +sys 0m0.401s Code generation completed in 2 seconds ************************************************************ * * @@ -237,7 +231,7 @@ Code generation completed in 2 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -245,10 +239,9 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -267,7 +260,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -275,10 +268,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_tt.mad/COPYRIGHT b/epochX/cudacpp/gg_tt.mad/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/gg_tt.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_tt.mad/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. 
@@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt index 68b4c46295..311ceaa803 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat index 66598786f5..404258ce86 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.5 2025-10-17 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat b/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat index 6b82577032..000832aacd 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat index b8db871c35..85e1d39035 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_tt.mad/MGMEVersion.txt b/epochX/cudacpp/gg_tt.mad/MGMEVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/gg_tt.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_tt.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/Source/.make_opts b/epochX/cudacpp/gg_tt.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/.make_opts +++ b/epochX/cudacpp/gg_tt.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/gg_tt.mad/Source/alfas_functions.f b/epochX/cudacpp/gg_tt.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/gg_tt.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/gg_tt.mad/Source/cuts.inc b/epochX/cudacpp/gg_tt.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/cuts.inc +++ b/epochX/cudacpp/gg_tt.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/gg_tt.mad/Source/make_opts b/epochX/cudacpp/gg_tt.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/make_opts +++ b/epochX/cudacpp/gg_tt.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/gg_tt.mad/Source/makefile b/epochX/cudacpp/gg_tt.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/makefile +++ b/epochX/cudacpp/gg_tt.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/gg_tt.mad/Source/run_card.inc b/epochX/cudacpp/gg_tt.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/run_card.inc +++ b/epochX/cudacpp/gg_tt.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). 
+ */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
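For reference, the grid-selection logic in the Bridge constructor shown above can be written as a standalone sketch (illustrative only, not the plugin's code): starting from 256 GPU threads per block, the thread count is halved until nevt is an exact multiple of gpublocks*gputhreads, throwing if it would drop below the minimum of 32.

  #include <stdexcept>

  // Illustrative sketch of the Bridge grid-selection loop above (ad hoc name).
  // The constructor has already checked that nevt is a non-zero multiple of the
  // 32-thread minimum, which guarantees that this loop terminates.
  inline void chooseGpuGrid( unsigned int nevt, int& gpublocks, int& gputhreads, const int gputhreadsmin = 32 )
  {
    gputhreads = 256;              // default number of gpu threads
    gpublocks = nevt / gputhreads; // may still leave nevt != gpublocks*gputhreads
    while( nevt != static_cast<unsigned int>( gpublocks * gputhreads ) )
    {
      gputhreads /= 2;
      if( gputhreads < gputhreadsmin )
        throw std::logic_error( "chooseGpuGrid: cannot choose gputhreads" );
      gpublocks = nevt / gputhreads;
    }
  }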
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
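The AOS-to-AOSOA index arithmetic used by the transposition code above can be checked with a tiny standalone program (illustrative only; np4=4 is the usual four-momentum dimension, while npar=4 and neppM=4 are assumed example values, not necessarily those of this build):

  #include <cstdio>

  // Map one (event, particle, component) element from the Fortran-style AOS
  // layout to the C++-style AOSOA layout, using the same arithmetic as above.
  int main()
  {
    const int npar = 4, np4 = 4, neppM = 4; // assumed example dimensions
    const int ievt = 5, ipar = 2, ip4 = 1;  // arbitrary element to map
    const int ipagM = ievt / neppM, ieppM = ievt % neppM;
    const int fpos = ievt * npar * np4 + ipar * np4 + ip4;                                  // AOS position
    const int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; // AOSOA position
    std::printf( "fpos=%d maps to cpos=%d\n", fpos, cpos );                                 // prints fpos=89 maps to cpos=101
    return 0;
  }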
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_tt.mad/SubProcesses/MGVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
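The gpuBlas* portability macros introduced in GpuAbstraction.h above, together with the checkGpuBlas helper added to GpuRuntime.h, can be exercised as in the following sketch (illustrative only, not code from this patch; it assumes a CUDA or HIP build with BLAS enabled, i.e. MGONGPU_HAS_NO_BLAS undefined, and only shows the call shapes, not a meaningful computation):

  // Ad hoc example function: double-precision GEMM through the gpuBlas* macros,
  // which expand to cublas* calls on CUDA and hipblas* calls on HIP.
  void exampleGpuBlasUse()
  {
    const int m = 2, n = 2, k = 2;
    double *dA, *dB, *dC;
    gpuMalloc( &dA, m * k * sizeof( double ) ); // device buffers (left uninitialized here)
    gpuMalloc( &dB, k * n * sizeof( double ) );
    gpuMalloc( &dC, m * n * sizeof( double ) );
    gpuBlasHandle_t handle;
    checkGpuBlas( gpuBlasCreate( &handle ) );
    const double alpha = 1.0, beta = 0.0;       // C = alpha*A*B + beta*C (column-major)
    checkGpuBlas( gpuBlasDgemm( handle, GPUBLAS_OP_N, GPUBLAS_OP_N, m, n, k, &alpha, dA, m, dB, k, &beta, dC, m ) );
    checkGpuBlas( gpuBlasDestroy( handle ) );
    gpuFree( dA );
    gpuFree( dB );
    gpuFree( dC );
  }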
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
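The runtime switch for BLAS color sums in MatrixElementKernels.cc above is driven purely by whether the CUDACPP_RUNTIME_BLASCOLORSUM environment variable is set and non-empty. A minimal sketch of that once-per-process pattern (illustrative only, not the plugin's code; the helper name is ad hoc):

  #include <cstdlib>
  #include <iostream>
  #include <string>

  // Ad hoc helper mirroring the env-var check above: the decision is taken once
  // and cached for the lifetime of the process.
  inline bool useBlasColorSum()
  {
    static const bool enabled = []() {
      const char* env = std::getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" );
      const bool on = ( env != nullptr && std::string( env ) != "" );
      std::cout << ( on ? "INFO: BLAS color sums enabled at runtime" : "INFO: BLAS color sums disabled at runtime" ) << std::endl;
      return on;
    }();
    return enabled;
  }

A user would therefore export the variable (for example CUDACPP_RUNTIME_BLASCOLORSUM=1) before launching the executable to enable the BLAS path, provided the build did not define MGONGPU_HAS_NO_BLAS.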
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h index 65a101888d..2fa0ce29e0 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace 
mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer 
DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index b32f4b931e..9a72b09e5a 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
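The DeviceAccessJamp2 accessor introduced above flattens the per-colour running sums as buffer[icol * nevt + ievt], i.e. colour-major with the event index innermost, so that consecutive GPU threads (consecutive ievt) read and write contiguous memory. The following is a minimal host-side sketch of that indexing convention only; the flatIndex helper and the toy sizes are illustrative and not part of the patch.

#include <cassert>
#include <cstdio>

// Illustrative only: the same flattening as DeviceAccessJamp2::kernelAccessIcol,
// written as a host function so the layout can be checked in isolation.
inline int flatIndex( const int icol, const int ievt, const int nevt )
{
  return icol * nevt + ievt; // colour-major, event index innermost
}

int main()
{
  const int ncolor = 2; // as in this P1_gg_ttx process
  const int nevt = 8;   // toy value (in the kernel this is gridDim.x * blockDim.x)
  // Consecutive events of the same colour are adjacent in memory (coalesced access)
  assert( flatIndex( 1, 4, nevt ) == flatIndex( 1, 3, nevt ) + 1 );
  for( int icol = 0; icol < ncolor; icol++ )
    printf( "icol=%d occupies flat elements [%d,%d)\n", icol, flatIndex( icol, 0, nevt ), flatIndex( icol, nevt, nevt ) );
  return 0;
}

The per-helicity jamp super-buffer used by sigmaKin relies on the same event-innermost convention (see DeviceAccessJamp and the ghelAllJamps + ighel * nevt offset further below).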
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -368,154 +424,43 @@ namespace mg5amcCpu jamp_sv[1] -= amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttx()?) 
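On the C++ side just above, the per-colour squared amplitudes used for the event-by-event colour choice are accumulated as jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ), i.e. one block of ncolor running sums per parity page. Below is a minimal scalar sketch of that bookkeeping, with std::complex<double> standing in for the SIMD complex type and std::norm playing the role of cxabs2; the names are illustrative and not part of the patch.

#include <array>
#include <complex>

// Illustrative only: accumulate |jamp(icol)|^2 per colour and per parity page,
// mirroring the jamp2_sv[ncolor * iParity + icol] indexing used above.
template<int nParity, int ncolor>
void accumulateJamp2( std::array<double, nParity * ncolor>& jamp2,
                      const std::array<std::complex<double>, ncolor>& jamp,
                      const int iParity )
{
  for( int icol = 0; icol < ncolor; icol++ )
    jamp2[ncolor * iParity + icol] += std::norm( jamp[icol] ); // re^2 + im^2, as cxabs2 does
}

On the GPU, the equivalent sums go into the colAllJamp2s super-buffer via atomicAdd (see below), because one CUDA stream per helicity may update the same slot concurrently.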
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2 }, - { -2, 16 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
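The comments removed here (and reintroduced in the new color_sum.cc below) describe the key optimisation of the colour sum: because the colour matrix is real and symmetric, the quadratic form jamp^dagger (cf/denom) jamp reduces to real arithmetic on Re(jamp) and Im(jamp), and only the upper triangle needs to be visited once the off-diagonal entries are pre-doubled. A minimal scalar sketch for this ncolor=2 process, using the denom = {3,3} and cf = {{16,-2},{-2,16}} constants quoted above (no SIMD, no mixed precision; the function name is illustrative):

#include <complex>

// Illustrative only: scalar colour sum |M|^2 = sum_ij jamp_i^* ( cf[i][j] / denom[i] ) jamp_j
// for ncolor=2, exploiting that cf is real and symmetric (see #475).
double colorSumScalar( const std::complex<double> jamp[2] )
{
  constexpr double denom[2] = { 3, 3 };
  constexpr double cf[2][2] = { { 16, -2 }, { -2, 16 } };
  double me2 = 0;
  for( int icol = 0; icol < 2; icol++ )
  {
    // Diagonal term, then pre-doubled off-diagonal terms of the upper triangle
    double ztempR = cf[icol][icol] / denom[icol] * jamp[icol].real();
    double ztempI = cf[icol][icol] / denom[icol] * jamp[icol].imag();
    for( int jcol = icol + 1; jcol < 2; jcol++ )
    {
      ztempR += 2 * cf[icol][jcol] / denom[icol] * jamp[jcol].real();
      ztempI += 2 * cf[icol][jcol] / denom[icol] * jamp[jcol].imag();
    }
    me2 += jamp[icol].real() * ztempR + jamp[icol].imag() * ztempI; // AMA + BMB, no imaginary cross terms
  }
  return me2;
}

In the patch itself the C++/SIMD path keeps this triangular form (TriangularNormalizedColorMatrix in color_sum_cpu), while the GPU path uses the full normalised colour matrix in device memory, optionally through cuBLAS/hipBLAS (color_sum_gpu).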
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -555,7 +500,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -588,6 +537,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -628,6 +581,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -748,8 +705,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -757,25 +714,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -920,13 +1055,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -938,18 +1067,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -974,93 +1108,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1102,7 +1173,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1125,7 +1196,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1134,21 +1205,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1162,8 +1235,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1179,11 +1254,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1285,14 +1361,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index feff1cc6e1..5d952c7419 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 3; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum 
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f index bc9bcfeb9b..008afc92ae 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f index db3c284caa..fc3ede89c4 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -442,51 +446,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. 
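The Fortran rework above replaces the flat event loop in the DSIG1_VEC finalisation with an outer loop over warps, so that the mirror-dependent beam ordering IB(1)/IB(2) and the per-warp ICONFIG can be set before each event is unweighted; the flat event index is then recovered as IVEC = (CURR_WARP-1)*WARP_SIZE + IWARP. A trivial C++ sketch of that index arithmetic, written 0-based whereas the Fortran code is 1-based (the names and toy sizes are illustrative):

#include <cassert>

// Illustrative only: the warp/lane to flat-event mapping used by the reworked
// DSIG1_VEC loop above, written 0-based (the Fortran code is 1-based).
inline int flatEvent( const int iwarp, const int ilane, const int warpSize )
{
  return iwarp * warpSize + ilane;
}

int main()
{
  const int warpSize = 32; // toy value; the actual WARP_SIZE is defined elsewhere in the Fortran code
  const int nbWarps = 4;   // toy value standing in for NB_WARP_USED
  int expected = 0;
  for( int iwarp = 0; iwarp < nbWarps; iwarp++ )
    for( int ilane = 0; ilane < warpSize; ilane++ )
    {
      assert( flatEvent( iwarp, ilane, warpSize ) == expected ); // every event visited exactly once, warp by warp
      ++expected;
    }
  return 0;
}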
- IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc new file mode 100644 index 0000000000..b68b9250fd --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc @@ -0,0 +1,427 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2 }, + { -2, 16 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 
) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& 
MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! 
From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need 
one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/configs.inc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/configs.inc index 99d3eecc56..0dbac30825 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/configs.inc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/configs.inc @@ -24,3 +24,5 @@ C Diagram 3 
DATA (SPROP(I,-2,3),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/3/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/cudacpp_overlay.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f index ec5722702a..30cca27587 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/fbridge.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/makefile_original.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f index 707ea40323..8481c73d0f 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -71,10 +71,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! 
-1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -227,17 +224,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -307,7 +293,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -350,7 +336,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -393,21 +380,24 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 2) /5.333333333333333D+00, - $ -6.666666666666666D-01/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 2) /16,-4/ C 1 T(1,2,3,4) - DATA (CF(I, 2),I= 1, 2) /-6.666666666666666D-01 - $ ,5.333333333333333D+00/ + DATA (CF(I),I= 3, 3) /16/ C 1 T(2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -446,10 +436,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -458,6 +450,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/addmothers.f index 9a31ed201d..d6cded9a2d 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. 
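The matrix1.f hunk above replaces the dense REAL*8 CF(NCOLOR,NCOLOR) color matrix with an integer array holding only the upper triangle (off-diagonal entries pre-doubled) plus a common DENOM, so the inner loop runs over J >= I and a single division by DENOM is applied at the end. This relies on the color matrix being real and symmetric: for jamp = A + iB the quadratic form conj(jamp)·CF·jamp reduces to AᵀCF·A + BᵀCF·B, so only the real part of the triangular sum is needed. The C++ sketch below, with hypothetical names (packedCF, denom, jamp), spells out the same bookkeeping for readability; it is an illustration, not code from this patch.

// Minimal C++ sketch (hypothetical names) of the packed upper-triangular color
// sum used in the new MATRIX1 loop: only J >= I entries are stored, off-diagonal
// entries are pre-doubled, and one division by denom is done at the end.
#include <complex>
#include <vector>

double colorSumPackedUpper( const std::vector<int>& packedCF,                    // size ncolor*(ncolor+1)/2
                            int denom,                                           // common denominator (e.g. 3)
                            const std::vector<std::complex<double>>& jamp )      // size ncolor
{
  const int ncolor = (int)jamp.size();
  double me = 0.;
  int idx = 0; // walks the packed rows exactly like CF_INDEX in matrix1.f
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0.;
    for( int j = i; j < ncolor; j++ ) ztemp += double( packedCF[idx++] ) * jamp[j];
    me += std::real( ztemp * std::conj( jamp[i] ) ); // keep only the real part, as in the Fortran REAL*8 accumulation
  }
  return me / denom;
}

For the gg_ttx color data above (packedCF = {16,-4,16}, denom = 3) this reproduces the same |M|^2 contribution as the old dense 5.333/-0.667 matrix while touching only the upper triangle once.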
+ maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else @@ -220,7 +221,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, ncolmp=0 endif if(mo_color.gt.1.and. - $ mo_color.ne.3.and.mo_color.ne.8)then + $ mo_color.ne.3.and.mo_color.ne.8.and.mo_color.ne.6)then da_color(1)=get_color(jpart(1,ida(1))) da_color(2)=get_color(jpart(1,ida(2))) call write_error(da_color(1), da_color(2), mo_color) @@ -326,8 +327,8 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif endif !end of check on LC -c Just zero helicity info for intermediate states - jpart(7,i) = 0 +c Just No helicity info for intermediate states + jpart(7,i) = 9 enddo ! do i 100 continue if (is_LC) call check_pure_internal_flow(icolalt,jpart, maxcolor) @@ -586,13 +587,13 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, i3=i3+1 c color for t-channels needs to be reversed if(i3.eq.1) icol(2,ires)=icolmp(1,i) - if(i3.eq.2) icol(1,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 c color for t-channels needs to be reversed if(i3bar.eq.1) icol(1,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(2,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(2,i) endif enddo @@ -764,6 +765,14 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, endif endif c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) + elseif(mo_color.eq.6.and.i3.eq.0.and.i3bar.eq.2)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue + elseif(mo_color.eq.6.and.i3.eq.2.and.i3bar.eq.0)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue else c Don't know how to deal with this call write_error(i3,i3bar,mo_color) @@ -814,12 +823,12 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(icolmp(1,i).gt.0)then i3=i3+1 if(i3.eq.1) icol(1,ires)=icolmp(1,i) - if(i3.eq.2) icol(2,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 if(i3bar.eq.1) icol(2,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(1,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(2,i) endif enddo @@ -830,23 +839,33 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(n3.le.1.and.n3bar.eq.0) icol(2,ires)=0 if(i3.ne.n3.or.i3bar.ne.n3bar) then - if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.0.and.i3.eq.0)then + if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.i3)then c This is an epsilon index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(1,ires)=maxcolor + if(i3.eq.0) then + maxcolor=maxcolor+1 + icol(1,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(2,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(2,ires)=-maxcolor endif - elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.0.and.i3bar.eq.0)then + elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.i3bar)then c This is an epsilonbar index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(2,ires)=maxcolor + if(i3bar.eq.0)then + maxcolor=maxcolor+1 + icol(2,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(1,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + 
icol(1,ires)=-maxcolor endif elseif(n3.gt.0.and.n3bar.eq.0.and.i3-i3bar.eq.n3.or. $ n3bar.gt.0.and.n3.eq.0.and.i3bar-i3.eq.n3bar.or. @@ -961,6 +980,12 @@ subroutine fix_s_color_indices(n3,n3bar,i3,i3bar,ncolmp,icolmp, if(n3.eq.1) icol(1,ires)=max_n3 if(n3bar.eq.1) icol(2,ires)=min_n3bar endif + do i=ires,-1 + if (icol(1,i).eq.maxcol) icol(1,i)=mincol + if (icol(1,i).eq.-maxcol) icol(1,i)=-mincol + if (icol(2,i).eq.maxcol) icol(2,i)=mincol + if (icol(2,i).eq.-maxcol) icol(2,i)=-mincol + enddo c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) endif else diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cluster.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/color_sum.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
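The color_sum_blas routine above maps the per-event color sum onto two BLAS steps: a gemm computing Ztemp = normalizedColorMatrix x Jamps (done separately for the real and the imaginary parts), followed by a strided-batched gemm that evaluates the per-event dot product Jamps(:,ievt) . Ztemp(:,ievt) and accumulates it into the MEs buffer (beta = 1). The CPU reference below is a minimal sketch with hypothetical names, shown only to make the two steps and the real/imaginary split explicit; it ignores the helicity dimension, the streams and the mixed-precision conversions handled in the actual code.

#include <vector>

// Step 1: ztemp[icol][ievt]  = sum_jcol normColMat[icol][jcol] * jampPart[jcol][ievt]
// Step 2: me[ievt]          += sum_icol jampPart[icol][ievt]   * ztemp[icol][ievt]
// Calling this once with the real parts and once with the imaginary parts gives,
// for a real symmetric color matrix, the same |M|^2 contribution per event as
// the triangular CPU/kernel implementations.
void colorSumReference( std::vector<double>& me,                // [nevt], accumulated in place (beta = 1)
                        const std::vector<double>& normColMat,  // [ncolor*ncolor], already divided by the denominators
                        const std::vector<double>& jampPart,    // [ncolor*nevt], real OR imaginary parts
                        int ncolor,
                        int nevt )
{
  std::vector<double> ztemp( ncolor * nevt, 0. );
  for( int icol = 0; icol < ncolor; icol++ )       // step 1: matrix-matrix product (the gemm above)
    for( int jcol = 0; jcol < ncolor; jcol++ )
      for( int ievt = 0; ievt < nevt; ievt++ )
        ztemp[icol * nevt + ievt] += normColMat[icol * ncolor + jcol] * jampPart[jcol * nevt + ievt];
  for( int ievt = 0; ievt < nevt; ievt++ )         // step 2: batched per-event dot products (the strided-batched gemm above)
    for( int icol = 0; icol < ncolor; icol++ )
      me[ievt] += jampPart[icol * nevt + ievt] * ztemp[icol * nevt + ievt];
}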
+ +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef 
MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
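The DeviceAccessJamp accessors above encode the "all helicities" cuBLAS-friendly layout: the jamp super-buffer is laid out as two contiguous [ncolor][nhel][nevt] blocks, first all real parts and then all imaginary parts, with the event index running fastest. The helper below is a minimal sketch (hypothetical name flatJampIndex) that just spells out the flat offset used in kernelAccessIcolIhelNhel, so the leading dimensions and strides passed to the BLAS calls are easier to follow.

// Flat offset into the jamp super-buffer of size 2*ncolor*nhel*nevt:
// part = 0 (real) or 1 (imaginary), then icol, then ihel, then ievt (fastest).
inline int flatJampIndex( int part, int icol, int ihel, int ievt,
                          int ncolor, int nhel, int nevt )
{
  return part * ncolor * nhel * nevt // real block first, imaginary block second
         + icol * nhel * nevt        // one [nhel][nevt] slab per color
         + ihel * nevt               // one [nevt] row per helicity
         + ievt;                     // event index runs fastest
}

// Example: the real part of jamp(icol,ihel,ievt) lives at
//   buffer[ flatJampIndex( 0, icol, ihel, ievt, ncolor, nhel, nevt ) ]
// and its imaginary part at
//   buffer[ flatJampIndex( 1, icol, ihel, ievt, ncolor, nhel, nevt ) ].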
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
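On top of the HASBLAS build-time detection and the BLASCXXFLAGS/BLASLIBFLAGS plumbing above, color_sum_gpu dispatches at runtime: when no BLAS handle is passed it launches one color_sum_kernel per good helicity on its own stream, otherwise it zeroes the temporary super-buffer and delegates to color_sum_blas. Its sanity checks mention the CUDACPP_RUNTIME_BLASCOLORSUM environment variable and HASBLAS=hasNoBlas as the knobs behind that choice. The snippet below is only a hypothetical illustration of how such a caller-side switch could look; the actual wiring (where the handle, streams and temporary buffer are created) is outside this diff, and the exact environment-variable semantics are an assumption.

#include <cstdlib>
#include <cstring>

// Hypothetical caller-side switch: decide whether color_sum_gpu should receive
// a BLAS handle (BLAS path) or a null pointer (kernel path). Assumes hasBlasBuild
// reflects whether the build defined MGONGPU_HAS_NO_BLAS (HASBLAS=hasNoBlas).
bool useBlasColorSum( bool hasBlasBuild )
{
  if( !hasBlasBuild ) return false;                        // hasNoBlas build: kernel path only
  const char* env = std::getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" );
  return env != nullptr && std::strlen( env ) > 0;         // opt-in via environment variable (assumption)
}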
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cuts.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. + return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. 
Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. + +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/genps.f index 1c32e93f5d..5449ab9e30 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/genps.f @@ -124,7 +124,8 @@ subroutine gen_mom(iconfig,mincfig,maxcfig,invar,wgt,x,p1) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer fake_id + common/to_sprop/sprop,tprid,fake_id logical firsttime double precision xprop(3,nexternal),tprop(3,nexternal) @@ -1373,6 +1374,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1389,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not. 
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1637,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1657,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1818,8 +1834,8 @@ double precision function get_channel_cut(p, config) common/to_forest/ iforest, tstrategy integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) - integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer tprid(-max_branch:-1,lmaxconfigs), fake_id + common/to_sprop/sprop,tprid,fake_id double precision stot,m1,m2 common/to_stot/stot,m1,m2 @@ -1915,7 +1931,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1946,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile deleted file mode 100644 index 49e6800fff..0000000000 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile +++ /dev/null @@ -1,327 +0,0 @@ -SHELL := /bin/bash - -include ../../Source/make_opts - -# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) -# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing -include ../../src/cudacpp_config.mk -ifeq ($(CUDACPP_BUILDDIR),) -$(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) -endif - -# Disable all Fortran warnings? 
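Two of the genps.f changes in the hunk above are worth spelling out. First, the new run_card parameter dsqrt_shatmax (default -1, meaning "no upper cut") complements the existing dsqrt_shat lower bound: it is enforced as an event cut in passcuts (cuts.f) and also caps the generated phase space (smax in gen_s, TAUMAX in GENCMS). Second, the single-diagram enhancement in get_channel_cut now uses a plain Breit-Wigner propagator weight. A minimal Python sketch, for illustration only (names mirror the Fortran variables shown above; this is not MG5aMC code):

    def pass_shat_window(shat, dsqrt_shat=0.0, dsqrt_shatmax=-1.0):
        """passcuts logic for the shat window: dsqrt_shat is the lower bound
        on sqrt(shat), dsqrt_shatmax the upper bound (-1 disables it)."""
        if shat < dsqrt_shat**2:
            return False
        if dsqrt_shatmax != -1.0 and shat > dsqrt_shatmax**2:
            return False
        return True

    def taumax(s, dsqrt_shatmax=-1.0):
        """Upper bound on tau = x1*x2 in GENCMS: shat = tau*S <= dsqrt_shatmax**2."""
        return dsqrt_shatmax**2 / s if dsqrt_shatmax != -1.0 else 1.0

    def bw_channel_factor(p2, mass, width):
        """Per-propagator factor in get_channel_cut after the change above:
        1/((p2 - M**2)**2 + (M*Gamma)**2), replacing the previous
        (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 form."""
        tmp = p2 - mass**2    # off-shellness (t - Mass**2 in the Fortran)
        tmp2 = mass * width   # Mass*Width
        return 1.0 / (tmp**2 + tmp2**2)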
-FFLAGS+= -w - -# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html -FFLAGS+= -cpp - -# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) -CXXFLAGS = -O3 -Wall -Wshadow -Wextra - -# Add -std=c++17 explicitly to avoid build errors on macOS -# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" -ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 -endif - -# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) -ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) - override CXX:=ccache $(CXX) -endif -###ifeq ($(USECCACHE)$(shell echo $(FC) | grep ccache),1) -### override FC:=ccache $(FC) -###endif - -# Load additional dependencies of the bias module, if present -ifeq (,$(wildcard ../bias_dependencies)) -BIASDEPENDENCIES = -else -include ../bias_dependencies -endif - -# Definitions - -LIBDIR = ../../lib/ -BINDIR = ../../bin/ -PROG = madevent - -ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") - include ../MadLoop_makefile_definitions -else - LINK_LOOP_LIBS = - LOOP_LIBS = - LOOP_INCLUDE = - LINK_MADLOOP_LIB = - MADLOOP_LIB = -endif - -LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias - -CUDACPP_MAKEFILE=cudacpp.mk -processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') -ifeq ($(BACKEND),cuda) -CUDACPP_COMMONLIB=mg5amc_common_cuda -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda -else ifeq ($(BACKEND),hip) -CUDACPP_COMMONLIB=mg5amc_common_hip -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip -else -CUDACPP_COMMONLIB=mg5amc_common_cpp -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp -endif - -LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) - -ifneq ("$(wildcard ../../Source/RUNNING)","") - LINKLIBS += -lrunning - LIBS += $(LIBDIR)librunning.$(libext) -endif - - -# Source files - -MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) -MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) -ifeq ($(strip $(MATRIX_HEL)),) - MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) -endif - - -PROCESS= myamp.o genps.o unwgt.o setcuts.o get_color.o \ - cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ - idenparts.o dummy_fct.o - -DSIG=driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) -DSIG_cudacpp=driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) - -SYMMETRY = symmetry.o idenparts.o - -# Binaries - -ifeq ($(UNAME),Darwin) -LDFLAGS += -lc++ # avoid 'Undefined symbols' for chrono::steady_clock on macOS (checked with otool -L libmg5amc_gg_ttx_cpp.so) -LDFLAGS += -mmacosx-version-min=11.3 # avoid "ld: warning: object file was built for newer macOS version than being linked" -else -LDFLAGS += -Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 (not supported on macOS) -endif - -# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) -.DEFAULT_GOAL := all - -ifeq ($(BACKEND),cuda) -all: $(PROG)_fortran 
$(CUDACPP_BUILDDIR)/$(PROG)_cuda -else ifeq ($(BACKEND),hip) -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip -else -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp -endif - -# Disable OpenMP by default: enable OpenMP only if USEOPENMP=1 (#758) -ifeq ($(USEOPENMP),1) -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -override OMPFLAGS = -fopenmp -LINKLIBS += -liomp5 # see #578 -LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -override OMPFLAGS = -fopenmp -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 -else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang -else -override OMPFLAGS = -fopenmp -endif -endif - -$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o - $(FC) -o $(PROG)_fortran $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) - -$(LIBS): .libs - -.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat - cd ../../Source; make - touch $@ - -$(CUDACPP_BUILDDIR)/.cudacpplibs: - $(MAKE) -f $(CUDACPP_MAKEFILE) - touch $@ - -# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH -# Use relative paths with respect to the executables ($ORIGIN on Linux) -# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary -ifeq ($(UNAME_S),Darwin) - override LIBFLAGSRPATH = -else ifeq ($(USEBUILDDIR),1) - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' -else - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' -endif - -.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link - -madevent_fortran_link: $(PROG)_fortran - rm -f $(PROG) - ln -s $(PROG)_fortran $(PROG) - -madevent_cuda_link: - $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) - -madevent_hip_link: - $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) - -madevent_cpp_link: - $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -override SUPPORTED_AVXS = cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto -madevent_%_link: - @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then echo "ERROR! 
Invalid target '$@' (supported madevent_cpp*_link targets are: $(foreach avx,$(SUPPORTED_AVXS),'madevent_cpp$(avx)_link'))"; exit 1; fi - $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_cuda now uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_hip also uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -counters.o: counters.cc timer.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -ompnumthreads.o: ompnumthreads.cc ompnumthreads.h - $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ - -$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) - $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) - -gensym: $(SYMMETRY) configs.inc $(LIBS) - $(FC) -o gensym $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) - -###ifeq (,$(wildcard fbridge.inc)) # Pointless: fbridge.inc always exists as this is the cudacpp-modified makefile! -###$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat -### cd ../../Source/MODEL; make -### -###$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat -### cd ../../Source; make -### -###$(LIBDIR)libpdf.$(libext): -### cd ../../Source/PDF; make -### -###$(LIBDIR)libgammaUPC.$(libext): -### cd ../../Source/PDF/gammaUPC; make -###endif - -# Add source so that the compiler finds the DiscreteSampler module. 
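The madevent_*_link convenience targets in the deleted makefile above (and their counterparts in the new overlay earlier in this diff) only rebuild the requested backend and repoint the madevent symlink; the madevent_%_link pattern rule additionally validates the cpp SIMD backend name against SUPPORTED_AVXS. A rough Python equivalent, for illustration only and not part of the patch:

    import os
    import subprocess

    SUPPORTED_AVXS = ["cppnone", "cppsse4", "cppavx2", "cpp512y", "cpp512z", "cppauto"]

    def madevent_link(backend, builddir, prog="madevent"):
        """What 'make madevent_<backend>_link' does for the cpp backends."""
        if backend not in SUPPORTED_AVXS:
            raise ValueError(f"Invalid target madevent_{backend}_link "
                             f"(supported: {SUPPORTED_AVXS})")
        target = os.path.join(builddir, f"{prog}_cpp")
        # make USEGTEST=0 BACKEND=<backend> <builddir>/madevent_cpp
        subprocess.run(["make", "USEGTEST=0", f"BACKEND={backend}", target], check=True)
        if os.path.lexists(prog):
            os.remove(prog)           # rm -f madevent
        os.symlink(target, prog)      # ln -s <builddir>/madevent_cpp madevent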
-$(MATRIX): %.o: %.f - $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%.o: %.f - $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%_cudacpp.o: %.f - $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ - -# Dependencies - -driver.f: genps.inc -symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc -genps.o: genps.inc nexternal.inc configs.inc -dummy_fct.0: run.inc genps.inc -cuts.o: genps.inc nexternal.inc pmass.inc -setcuts.o: genps.inc run_config.inc -invarients.o: genps.inc nexternal.inc -myamp.o: props.inc genps.inc nexternal.inc -reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ - run_config.inc -cluster.o: cluster.inc genps.inc nexternal.inc message.inc -addmothers.o: genps.inc nexternal.inc symswap.inc message.inc -unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ - run_config.inc -initcluster.o: message.inc - -# Extra dependencies on discretesampler.mod - -auto_dsig.o: .libs -driver.o: .libs -driver_cudacpp.o: .libs -$(MATRIX): .libs -genps.o: .libs - -# Cudacpp bldall targets - -ifeq ($(UNAME_P),ppc64le) -bldavxs: bldnone bldsse4 -else ifeq ($(UNAME_P),arm) -bldavxs: bldnone bldsse4 -else -bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z -endif - -ifneq ($(shell which hipcc 2>/dev/null),) -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldhip bldcuda bldavxs -else -bldall: bldhip bldavxs -endif -else -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldcuda bldavxs -else -bldall: bldavxs -endif -endif - -bldcuda: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cuda - -bldhip: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=hip - -bldnone: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppnone - -bldsse4: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 - -bldavx2: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 - -bld512y: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y - -bld512z: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z - -# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) - -clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn - $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(CUDACPP_BUILDDIR)/$(PROG)_hip - -cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src - $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall - rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs - rm -f .libs - -cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src - make -C ../../Source cleanall - rm -rf $(LIBDIR)libbias.$(libext) - rm -f ../../Source/*.mod ../../Source/*/*.mod - -distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation - $(MAKE) -f $(CUDACPP_MAKEFILE) distclean diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile new file mode 120000 index 0000000000..9fba275947 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile @@ -0,0 +1 @@ +makefile_wrapper.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile_original.mk new file mode 100644 
index 0000000000..348c283be7 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile_original.mk @@ -0,0 +1,101 @@ +include ../../Source/make_opts +FFLAGS+= -w + +# Load additional dependencies of the bias module, if present +ifeq (,$(wildcard ../bias_dependencies)) +BIASDEPENDENCIES = +else +include ../bias_dependencies +endif + +# Definitions + +LIBDIR = ../../lib/ +BINDIR = ../../bin/ +PROG = madevent + +ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") + include ../MadLoop_makefile_definitions +else + LINK_LOOP_LIBS = + LOOP_LIBS = + LOOP_INCLUDE = + LINK_MADLOOP_LIB = + MADLOOP_LIB = +endif + +LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L../../lib/ -ldhelas -ldsample -lmodel -lgeneric -lpdf -lgammaUPC -lcernlib $(llhapdf) -lbias + +LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) + +ifneq ("$(wildcard ../../Source/RUNNING)","") + LINKLIBS += -lrunning + LIBS += $(LIBDIR)librunning.$(libext) +endif + + +# Source files + +MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) +MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) +ifeq ($(strip $(MATRIX_HEL)),) + MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) +endif + + +PROCESS= driver.o myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o \ + $(patsubst %.f,%.o,$(wildcard auto_dsig*.f)) \ + +SYMMETRY = symmetry.o idenparts.o + +# Binaries + +$(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) + $(FC) -o $(PROG) $(PROCESS) $(MATRIX) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) + +$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat + cd ../../Source/MODEL; make + +$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat + cd ../../Source; make + +$(LIBDIR)libpdf.$(libext): + cd ../../Source/PDF; make + +$(LIBDIR)libgammaUPC.$(libext): + cd ../../Source/PDF/gammaUPC; make + +# Add source so that the compiler finds the DiscreteSampler module. 
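For orientation: this hunk and the neighbouring ones replace the single cudacpp-modified SubProcesses/makefile with a small stack of files. makefile_original.mk (the new file above) restores the upstream madevent rules, the cudacpp-specific targets move to a separate overlay (presumably the overlay rules shown at the top of this section), and makefile itself becomes a symlink to the wrapper shown just below:

    makefile              -> symlink to makefile_wrapper.mk
    makefile_wrapper.mk   -> sets SHELL and includes the two files below
    makefile_original.mk  -> upstream madevent build rules (this file)
    cudacpp_overlay.mk    -> cudacpp additions (cudacpp executables, link/bld targets)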
+$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +# Dependencies + +driver.f: genps.inc +symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc +genps.o: genps.inc nexternal.inc configs.inc +dummy_fct.0: run.inc genps.inc +cuts.o: genps.inc nexternal.inc pmass.inc +setcuts.o: genps.inc run_config.inc +invarients.o: genps.inc nexternal.inc +myamp.o: props.inc genps.inc nexternal.inc +reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ + run_config.inc +cluster.o: cluster.inc genps.inc nexternal.inc message.inc +addmothers.o: genps.inc nexternal.inc symswap.inc message.inc +unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ + run_config.inc +initcluster.o: message.inc + +clean: + $(RM) *.o gensym madevent madevent_forhel diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/myamp.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min with dsqrt_shatmax**2 with the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/reweight.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/reweight.f index 0a0bafa7c1..9d8fe1c4f0 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/reweight.f @@ -976,9 +976,9 @@ logical function setclscales(p, keepq2bck, ivec) $ ' and jcentral is ',jcentral(1),jcentral(2) if (btest(mlevel,3)) then - write(*,'(a$)') 'QCD jets (final): ' + write(*,'(a,$)') 'QCD jets (final): ' do i=3,nexternal - if(iqjets(i).gt.0) write(*,'(i3$)') i + if(iqjets(i).gt.0) write(*,'(i3,$)') i enddo write(*,*) endif @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) @@ -1387,7 +1387,9 @@ double precision function rewgt(p, ivec) integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - include 'configs.inc' + integer fake_id + common/to_sprop/sprop,tprid,fake_id +c include 'configs.inc' real*8 xptj,xptb,xpta,xptl,xmtc real*8 xetamin,xqcut,deltaeta common /to_specxpt/xptj,xptb,xpta,xptl,xmtc,xetamin,xqcut,deltaeta @@ -1588,6 +1590,8 @@ double precision function rewgt(p, ivec) $ ipdgcl(1,igraphs(1),iproc),ipart,.false.).and. $ (goodjet(idacl(n,1)).or.goodjet(idacl(n,2)))) then c alpha_s weight + + if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1600,6 +1604,7 @@ double precision function rewgt(p, ivec) write(*,*)' as: ',alphas(alpsfact*dsqrt(q2now)), & '/',asref,' -> ',alphas(alpsfact*dsqrt(q2now))/asref write(*,*)' and G=',SQRT(4d0*PI*ALPHAS(scale)) + endif endif endif endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/symmetry.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/symmetry.f index 309540a0a2..d0706e90b4 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/symmetry.f @@ -51,6 +51,7 @@ program symmetry integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) + integer fake_id include 'configs.inc' data use_config/0,lmaxconfigs*0/ @@ -232,7 +233,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +243,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +261,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/unwgt.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/gg_tt.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/gg_tt.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py index 42d82818d0..2bc6174b85 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3306,7 +3353,7 @@ def edit_dummy_fct_from_file(self, filelist, outdir): def retro_compatible_custom_fct(lines, mode=None): f77_type = ['real*8', 'integer', 'double precision', 'logical'] - function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ + function_pat = re.compile(r'^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ % {'type':'|'.join(f77_type)}, re.I+re.M) include_pat = re.compile(r"\s+include\s+[\'\"]([\w\./]*)") @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - 
logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/check_param_card.py b/epochX/cudacpp/gg_tt.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
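The new flavour_bias run_card option validated just above takes a pair [pdg, factor]: per the in-card comment, events containing the given |pdg| are generated factor times more often while their weight is divided by the same factor, which is why check_validity forces event_norm to 'bias' whenever the factor is not 1. A hedged sketch of the weight compensation implied by that convention (illustration only; the actual biasing is done in the generation code, not in this hunk):

    def compensate_weight(weight, event_pdgs, flavour_bias=(5, 1)):
        """Divide the event weight by the enhancement factor when the event
        contains the biased flavour, per the flavour_bias documentation."""
        pdg, factor = flavour_bias
        if factor != 1 and any(abs(p) == pdg for p in event_pdgs):
            return weight / factor
        return weight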
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py index 9ff7390cf5..8de498fcc2 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. 
Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. (beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. 
Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" @@ -6868,8 +6890,9 @@ def handle_alarm(signum, frame): else: log_level=20 - - if run_card: + if run_card and (run_card['lpp1'] !=0 or run_card['lpp2'] !=0): + # They are likely case like lpp=+-3, where alpas not need reset + # but those have dedicated name of pdf avoid the reset as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, 'cteq6_l': 0.118, diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/gg_tt.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/file_writers.py b/epochX/cudacpp/gg_tt.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/files.py b/epochX/cudacpp/gg_tt.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/files.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - logger.warning(why) + logger.warning("fail 
to cp", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "

To save bandwidth not all diagrams were converted to PNG."; print PAGE "
To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 'nprocs' in opts: + 
self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/gg_tt.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/histograms.py b/epochX/cudacpp/gg_tt.mad/bin/internal/histograms.py index 51ae2914fc..0883cd9613 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the variation. 
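The gridpack refinement reworked in gen_ximprove.py above now splits a channel into several refine jobs whenever it is asked for more events than the new maxevts cap (2500 by default, applied when nprocs > 1) allows per job. A minimal sketch of that arithmetic, using a hypothetical helper name that is not part of the patch:

def compute_nb_split(needed_event, max_request_event, max_splitting, split_channels=True):
    """Number of refine jobs for one channel: ceil(needed_event / max_request_event),
    forced to 1 when channel splitting is disabled and capped at max_splitting."""
    nb_split = int(max(1, (needed_event - 1) // max_request_event + 1))
    if not split_channels:
        nb_split = 1
    return max(1, min(nb_split, max_splitting))

# e.g. 10000 requested events with maxevts=2500 -> 4 jobs of roughly 2500 events each
assert compute_nb_split(10000, 2500, max_splitting=100) == 4

When nprocs > 1 the resulting jobs are handed to a cluster.MultiCore pool instead of being run one by one, and a periodic "Idle/Running/Done" status line is printed so that long gridpack generations no longer look stuck.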
+ if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2405,10 +2404,10 @@ def output(self, path, format='gnuplot',number_of_ratios = -1, gnuplot_output_list=gnuplot_output_list_v5 else: output, _ = p.communicate() - output.decode(errors='ignore') + output = output.decode(errors='ignore') if not output: gnuplot_output_list=gnuplot_output_list_v5 - elif float(output.split()[1]) < 5. : + elif int(output.split()[1].split('.')[0]) < 5 : gnuplot_output_list=gnuplot_output_list_v4 else: gnuplot_output_list=gnuplot_output_list_v5 @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. 
Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. 
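Among the histograms.py fixes above, the gnuplot version probe now keeps the result of decode() (the old code called it and discarded the return value, leaving bytes) and compares only the integer major version instead of calling float() on the whole version token, presumably to stay robust for version strings that are not a plain float. A small sketch of the parsing, with a hypothetical helper name:

def gnuplot_major_version(raw_output):
    # e.g. b'gnuplot 5.4 patchlevel 2' -> 5 (illustrative helper, not in the patch)
    text = raw_output.decode(errors='ignore') if isinstance(raw_output, bytes) else raw_output
    return int(text.split()[1].split('.')[0])

assert gnuplot_major_version(b'gnuplot 5.4 patchlevel 2') == 5
assert gnuplot_major_version('gnuplot 6.0.1') == 6

The companion change in the same file swaps sqrt(-1) for 1/0 in the generated plot commands; gnuplot treats both as an undefined value, so the data points are suppressed while the key entry is still printed.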
import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/gg_tt.mad/bin/internal/lhe_parser.py index f6e47956cd..d4b94bab10 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -1792,7 +1794,10 @@ def add_decays(self, pdg_to_decay): if particle.pdg in pdg_to_decay and pdg_to_decay[particle.pdg]: one_decay = pdg_to_decay[particle.pdg].pop() self.add_decay_to_particle(i, one_decay) + particle.helicity = 9 return 
self.add_decays(pdg_to_decay) + + return self @@ -2166,10 +2171,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2961,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..dea35930ea 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz 
%(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3667,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_comine_iteration(self, line): + def do_combine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
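Separately, the lhe_parser.py hunk above flips the sign convention of FourMomentum.pseudorapidity, which previously returned minus eta because the numerator and denominator were swapped. A standalone check of the corrected formula (the function name here is illustrative, not the class method):

import math

def pseudorapidity(px, py, pz):
    # eta = 0.5 * ln((|p| + pz) / (|p| - pz)), i.e. atanh(pz / |p|)
    norm = math.sqrt(px**2 + py**2 + pz**2)
    return 0.5 * math.log((norm + pz) / (norm - pz))

# a forward-going particle (pz > 0) must have positive eta
assert pseudorapidity(3.0, 4.0, 12.0) > 0
assert abs(pseudorapidity(3.0, 4.0, 12.0) - math.atanh(12.0 / 13.0)) < 1e-12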
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
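The do_pythia8 rework above makes the shower default to Pythia8's bundled main164 driver (invoked with -c and the generated command file) and only uses the external MG5aMC_PY8_interface when --old_interface is passed or main164 cannot be found. A hedged sketch of the executable lookup order, as a hypothetical standalone helper:

import os

def find_pythia8_main164(pythia8_path):
    """Return the path of Pythia8's main164 driver, or None if it is not built.
    Mirrors the lookup in do_pythia8: share/Pythia8/examples first, then examples."""
    for candidate in (os.path.join(pythia8_path, 'share', 'Pythia8', 'examples', 'main164'),
                      os.path.join(pythia8_path, 'examples', 'main164')):
        if os.path.exists(candidate):
            return candidate
    return None

When neither location exists, the patch falls back to the old interface by re-invoking do_pythia8 with --old_interface appended.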
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
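The parallel Pythia8 path above writes one PY8 card per split, pinning Main:numberOfEvents to that split's share of events and rescaling HEPMCoutput:scaling by the same number. The split(a, n) generator that appears as context earlier in madevent_interface.py produces exactly this kind of near-even partition; it is reproduced here with a usage check:

def split(a, n):
    # partition the sequence a into n chunks whose lengths differ by at most one
    k, m = divmod(len(a), n)
    return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))

assert [len(chunk) for chunk in split(list(range(10)), 3)] == [4, 3, 3]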
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -5596,6 +5635,19 @@ def do_plot(self, line): else: logger.info('No valid files for delphes plot') + def do_compile(self, line): + """compile the current directory """ + + args = self.split_arg(line) + self.ask_run_configuration(mode='parton') + self.run_card = banner_mod.RunCard(pjoin(self.me_dir, 'Cards', 'run_card.dat')) + self.configure_directory(html_opening =False) + + for Pdir in self.get_Pdir(): + misc.sprint(Pdir) + self.compile(['gensym'], cwd=Pdir) + self.compile(['madevent_forhel'], cwd=Pdir) + ############################################################################ def do_syscalc(self, line): """Evaluate systematics variation weights for a given run""" @@ -6132,7 +6184,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in 
Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file.' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6896,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6906,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7023,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... 
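The new remove_empty_events helper above drops Gdirectories whose events.lhe is essentially empty and then scans each channel log to classify why (no points passing cuts, impossible Breit-Wigner configuration, all amplitudes zero, or unknown), warning only about the genuinely critical cases. A hedged sketch of just the size test (the helper name is illustrative, not part of the patch):

import os

def has_usable_events(gdir, min_bytes=10):
    # mirrors the 'size < 10' cut used in remove_empty_events
    try:
        return os.path.getsize(os.path.join(gdir, 'events.lhe')) >= min_bytes
    except OSError:
        return False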
self.exec_cmd('combine_events') @@ -6902,6 +7051,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7066,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7078,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7203,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7223,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/restore_data b/epochX/cudacpp/gg_tt.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/restore_data +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . 
-mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/sum_html.py b/epochX/cudacpp/gg_tt.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s
' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/gg_tt.mad/bin/madevent b/epochX/cudacpp/gg_tt.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/madevent +++ b/epochX/cudacpp/gg_tt.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h index febf1dcf42..0561db9dc0 100644 --- a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc index d09f387480..4772e6dc1d 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h index ba434e7b98..41fb70a23e 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h index 7c6a082392..be5c5a6357 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 816b17272d..021fefaea7 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +57,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006091594696044922  +DEBUG: model prefixing takes 0.004730224609375  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,45 +150,45 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.008 s +1 processes with 3 diagrams generated in 0.007 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_tt Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. 
+DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.144 s +ALOHA: aloha creates 2 routines in 0.123 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. quit -real 0m0.544s -user 0m0.472s -sys 0m0.060s +real 0m0.525s +user 0m0.459s +sys 0m0.062s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_tt.sa/COPYRIGHT b/epochX/cudacpp/gg_tt.sa/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/gg_tt.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_tt.sa/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. 
- * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). 
+ */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
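The grid-selection loop in the Bridge constructor above can be read in isolation: starting from 256 GPU threads per block, it halves the block size until the event count factorizes exactly as gpublocks*gputhreads, and refuses to go below the s_gputhreadsmin floor. The following is a minimal standalone sketch of that logic; the function name chooseGpuGrid and the plain int pair it returns are illustrative only, not part of the plugin API.

#include <stdexcept>
#include <string>
#include <utility>

// Sketch of the Bridge constructor's grid selection: pick (gpublocks, gputhreads)
// with gpublocks * gputhreads == nevt, starting from 256 threads per block.
std::pair<int, int> chooseGpuGrid( int nevt, int gputhreadsmin = 32 )
{
  if( nevt < gputhreadsmin || nevt % gputhreadsmin != 0 )
    throw std::runtime_error( "nevt should be a multiple of " + std::to_string( gputhreadsmin ) );
  int gputhreads = 256;              // default number of gpu threads
  int gpublocks = nevt / gputhreads; // integer division, may not factorize nevt exactly yet
  while( nevt != gpublocks * gputhreads )
  {
    gputhreads /= 2; // halve the block size and retry
    if( gputhreads < gputhreadsmin )
      throw std::logic_error( "cannot choose gputhreads" ); // unreachable given the checks above
    gpublocks = nevt / gputhreads;
  }
  return { gpublocks, gputhreads };
}

For example, nevt=8192 keeps the default and yields (32, 256), while nevt=96 falls through to (3, 32).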
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
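The param_card lookup in the Bridge constructor above deliberately avoids std::filesystem (issue #803): it calls stat() through a small lambda and climbs at most two directories up from the new default of "../Cards/param_card.dat". A self-contained sketch of that search follows; findParamCard is an illustrative name rather than a plugin function, and the POSIX stat() call mirrors what the constructor itself uses.

#include <cstddef>
#include <string>
#include <sys/stat.h> // stat(), used to bypass std::filesystem (#803)

// Sketch of the Bridge constructor's param_card search: start from a relative
// default and prepend "../" up to maxLevelsUp times until the file is found.
inline std::string findParamCard( std::string paramCard = "../Cards/param_card.dat",
                                  std::size_t maxLevelsUp = 2 )
{
  auto fileExists = []( const std::string& fileName )
  { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; };
  for( std::size_t k = 0; k < maxLevelsUp; ++k )
  {
    if( fileExists( paramCard ) ) break; // found it: stop climbing
    paramCard = "../" + paramCard;       // otherwise look one directory further up
  }
  return paramCard; // may still not exist: initProc reports the error downstream
}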
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
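The transposition code in this hunk reduces to two index formulas for the same (event, particle, component) triple: fpos in the Fortran-ordered AOS array and cpos in the cudacpp AOSOA array. Below is a host-loop sketch of the F2C direction only, with npar, np4 and neppM passed as plain arguments instead of the CPPProcess and MemoryAccessMomenta constants used in the real code; the function name is illustrative.

#include <cassert>

// Sketch of the AOS (Fortran) -> AOSOA (cudacpp) momenta copy:
//   in : AOS   [nevt][npar][np4]          (P_MULTI as seen from C)
//   out: AOSOA [npagM][npar][np4][neppM]  with nevt = npagM * neppM
template<typename T>
void transposeMomentaF2C( T* out, const T* in, int nevt, int npar, int np4, int neppM )
{
  assert( nevt % neppM == 0 ); // the number of events must be a multiple of neppM
  for( int ievt = 0; ievt < nevt; ievt++ )
  {
    const int ipagM = ievt / neppM; // "page" of neppM events
    const int ieppM = ievt % neppM; // event within the page
    for( int ipar = 0; ipar < npar; ipar++ )
      for( int ip4 = 0; ip4 < np4; ip4++ )
      {
        const int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM;
        const int fpos = ievt * npar * np4 + ipar * np4 + ip4;
        out[cpos] = in[fpos]; // F2C (Fortran to C)
      }
  }
}

With neppM=1 the two layouts coincide and the loops degenerate into an element-wise copy, which is why the comments in this hunk note that a memcpy is enough in that case.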
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
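The checkGpuBlas/assertGpuBlas pair added to GpuRuntime.h above mirrors the existing checkGpu pattern: each cuBLAS/hipBLAS call is wrapped so that a status other than GPUBLAS_STATUS_SUCCESS is printed with file and line and then asserted on. A minimal usage sketch against the gpuBlas* and gpuStream* abstraction macros follows; it assumes a GPU build with BLAS enabled (MGONGPU_HAS_NO_BLAS not defined), and demoCreateBlasHandle is an illustrative helper, not part of the plugin.

#include "GpuAbstraction.h"
#include "GpuRuntime.h" // checkGpu, checkGpuBlas

// Sketch: create a BLAS handle and attach it to a freshly created stream,
// checking every returned status along the way.
gpuBlasHandle_t demoCreateBlasHandle( gpuStream_t& stream )
{
  gpuBlasHandle_t handle;
  gpuStreamCreate( &stream );                         // checkGpu is already baked into this macro
  checkGpuBlas( gpuBlasCreate( &handle ) );           // assert if the status is not GPUBLAS_STATUS_SUCCESS
  checkGpuBlas( gpuBlasSetStream( handle, stream ) ); // route subsequent BLAS calls to this stream
  return handle;
}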
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
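The runtime switch analysed in the MatrixElementKernelDevice constructor above boils down to a single getenv check: CUDACPP_RUNTIME_BLASCOLORSUM enables BLAS color sums only when it is set and non-empty, and only if the build did not define MGONGPU_HAS_NO_BLAS (the TF32 variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is layered on top in the same way). A condensed sketch of that decision follows; envIsSet and decideBlasColorSum are illustrative helpers, not plugin functions.

#include <cstdlib>
#include <iostream>
#include <stdexcept>
#include <string>

// A variable counts as "set" only if it exists and is non-empty.
inline bool envIsSet( const char* name )
{
  const char* value = std::getenv( name );
  return value && std::string( value ) != "";
}

// Sketch of the one-time decision on BLAS color sums.
inline bool decideBlasColorSum()
{
  if( !envIsSet( "CUDACPP_RUNTIME_BLASCOLORSUM" ) ) return false; // default: BLAS disabled
#ifdef MGONGPU_HAS_NO_BLAS
  throw std::runtime_error( "CUDACPP_RUNTIME_BLASCOLORSUM is set, but BLAS was disabled at build time" );
#else
  std::cout << "INFO: CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl;
  return true;
#endif
}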
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h index 65a101888d..2fa0ce29e0 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace 
mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer 
DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc index 5c7a133eed..fe42002366 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
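The DeviceAccessJamp2 helper introduced a few hunks above addresses element (icol, ievt) of a colour-major device buffer as buffer[icol * nevt + ievt], with nevt and ievt derived from the launch grid. The standalone CUDA sketch below illustrates only that indexing convention; the kernel and buffer names are hypothetical.

#include <cuda_runtime.h>

__global__ void writeColorMajor( double* buffer, int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;                // same convention as DeviceAccessJamp2
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  for( int icol = 0; icol < ncolor; icol++ )
    buffer[icol * nevt + ievt] = icol;                    // element (icol, ievt) of the [ncolor][nevt] buffer
}

int main()
{
  const int gpublocks = 2, gputhreads = 32, ncolor = 2;
  const int nevt = gpublocks * gputhreads;
  double* d_buf = nullptr;
  cudaMalloc( &d_buf, ncolor * nevt * sizeof( double ) );
  cudaMemset( d_buf, 0, ncolor * nevt * sizeof( double ) );
  writeColorMajor<<<gpublocks, gputhreads>>>( d_buf, ncolor );
  cudaDeviceSynchronize();
  cudaFree( d_buf );
  return 0;
}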
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -365,154 +421,43 @@ namespace mg5amcCpu jamp_sv[1] -= amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttx()?) 
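The hunks above and below remove the hard-coded ncolor=2 colour algebra from the per-helicity routine (it reappears in the new color_sum.cc file further down in this diff). For reference, a minimal standalone C++ sketch of the quadratic form being computed for one event, using the same colour matrix {{16,-2},{-2,16}} and denominators {3,3} as in the removed code, and using the fact that the matrix is real so only the Re*M*Re and Im*M*Im terms contribute; the example jamp values are made up.

#include <complex>
#include <cstdio>

int main()
{
  constexpr int ncolor = 2;
  constexpr double denom[ncolor] = { 3, 3 };
  constexpr double cf[ncolor][ncolor] = { { 16, -2 }, { -2, 16 } };
  const std::complex<double> jamp[ncolor] = { { 1.0, 0.5 }, { -0.25, 2.0 } }; // example partial amplitudes
  double deltaME = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    double ztempR = 0, ztempI = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ )
    {
      ztempR += cf[icol][jcol] * jamp[jcol].real();
      ztempI += cf[icol][jcol] * jamp[jcol].imag();
    }
    deltaME += ( ztempR * jamp[icol].real() + ztempI * jamp[icol].imag() ) / denom[icol];
  }
  std::printf( "|M|^2 contribution for this helicity: %f\n", deltaME );
  return 0;
}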
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2 }, - { -2, 16 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
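As the comment in the hunk above notes, once each good helicity is processed in its own stream, several kernels may add |jamp|^2 into the same (icol, ievt) slot of the shared jamp2 buffer concurrently, so the update must be atomic. A minimal standalone CUDA sketch of that pattern follows; the kernel name and values are hypothetical, and atomicAdd on double requires compute capability 6.0 or later.

#include <cuda_runtime.h>

__global__ void addAbs2( double* jamp2, double re, double im )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  atomicAdd( &jamp2[ievt], re * re + im * im ); // safe even if another helicity stream updates the same slot
}

int main()
{
  const int gpublocks = 2, gputhreads = 32, nGoodHel = 2;
  const int nevt = gpublocks * gputhreads;
  double* d_jamp2 = nullptr;
  cudaMalloc( &d_jamp2, nevt * sizeof( double ) );
  cudaMemset( d_jamp2, 0, nevt * sizeof( double ) );
  cudaStream_t streams[2];
  for( int i = 0; i < nGoodHel; i++ ) cudaStreamCreate( &streams[i] );
  for( int i = 0; i < nGoodHel; i++ ) // one launch per good helicity, each on its own stream
    addAbs2<<<gpublocks, gputhreads, 0, streams[i]>>>( d_jamp2, 1.0 + i, 0.5 * i );
  cudaDeviceSynchronize();
  for( int i = 0; i < nGoodHel; i++ ) cudaStreamDestroy( streams[i] );
  cudaFree( d_jamp2 );
  return 0;
}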
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -552,7 +497,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -585,6 +534,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -625,6 +578,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -745,8 +702,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -754,25 +711,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -917,13 +1052,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -935,18 +1064,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -971,93 +1105,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1099,7 +1170,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1122,7 +1193,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1131,21 +1202,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1159,8 +1232,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1176,11 +1251,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1282,14 +1358,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h index feff1cc6e1..5d952c7419 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 3; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum 
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/color_sum.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/color_sum.cc new file mode 100644 index 0000000000..b68b9250fd --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/color_sum.cc @@ -0,0 +1,427 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
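+// This file collects the color-sum implementations for this process: color_sum_cpu (C++/SIMD),
+// color_sum_kernel (CUDA/HIP kernel) and color_sum_blas (cuBLAS/hipBLAS GEMMs over all good
+// helicities at once), with color_sum_gpu dispatching between the kernel and BLAS paths at runtime,
+// and createNormalizedColorMatrix copying the normalized color matrix into device memory.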
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2 }, + { -2, 16 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
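+    // For this process (ncolor=2, colorMatrix={{16,-2},{-2,16}}, colorDenom={3,3}) the precomputed
+    // triangular matrix is cf2.value = { { 16/3, -4/3 }, { 0, 16/3 } }: only the upper triangle is
+    // filled and the off-diagonal entry already includes the factor 2, so the loop below computes
+    // deltaMEs = 16/3 * ( |jamp0|^2 + |jamp1|^2 ) - 4/3 * ( Re(jamp0)*Re(jamp1) + Im(jamp0)*Im(jamp1) )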
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/color_sum.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/cudacpp_overlay.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/cudacpp_overlay.mk 
@@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/fbridge.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/makefile_original.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/color_sum.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices 
for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
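+# NB: the color sum is built as a separate object (color_sum_cpp.o and color_sum_$(GPUSUFFIX).o, see
+# cxx_objects_lib and gpu_objects_lib below) and GPU builds link cuBLAS/hipBLAS through BLASLIBFLAGS
+# unless HASBLAS=hasNoBlas (see the HASBLAS section below).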
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
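+# DSIG collects driver.o and the auto_dsig*.o objects (except auto_dsig.o, which is linked separately);
+# DSIG_cudacpp is the same list with the _cudacpp suffix, built with -DMG5AMC_MEEXPORTER_CUDACPP
+# (see the %_cudacpp.o pattern rule below).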
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. 
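+// extern "C" declarations of the Fortran-callable entry points to the cudacpp Bridge: create/delete
+// a Bridge instance, run the matrix-element sequence with or without multichannel, and query the
+// number of good (and total) helicities.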
+ +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h index febf1dcf42..0561db9dc0 100644 --- a/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc index d09f387480..4772e6dc1d 100644 --- a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h index ba434e7b98..41fb70a23e 100644 --- a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h index d3c4ca5695..7d34de72f8 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
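+# The googletest sources are cloned at tag v1.17.0; on non-macOS hosts the cmake build adds
+# -mavx2 -mfma through GTEST_CMAKE_FLAGS (see below).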
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index 6466d14e6d..74af92edcf 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +57,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00611424446105957  +DEBUG: model prefixing takes 0.005340576171875  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,7 +150,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.008 s +1 processes with 3 diagrams generated in 0.010 s Total: 1 processes with 3 diagrams add process g g > t t~ g INFO: Checking for minimal orders which gives processes. @@ -159,21 +158,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.019 s +1 processes with 16 diagrams generated in 0.024 s Total: 2 processes with 19 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_gg_tt01g INFO: remove old information in CODEGEN_mad_gg_tt01g -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @2 INFO: Processing color information for process: g g > t t~ g @2 @@ -187,9 +186,9 @@ FileWriter t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1577]  INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -198,25 +197,25 @@ FileWriter t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1552]  -Generated helas calls for 2 subprocesses (19 diagrams) in 0.043 s -Wrote files for 46 helas calls in 0.184 s +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1577]  +Generated helas calls for 2 subprocesses (19 diagrams) in 0.050 s +Wrote files for 46 helas calls in 0.272 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.322 s +ALOHA: aloha creates 5 routines in 0.316 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.311 s +ALOHA: aloha creates 10 routines in 0.352 s VVV1 VVV1 FFV1 @@ -226,41 +225,32 @@ ALOHA: aloha creates 10 routines in 0.311 s VVVV1 VVVV3 VVVV4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses/P1_gg_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses/P2_gg_ttxg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #2 succeeded at 243 (offset 16 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done. +DEBUG: result.returncode =  0 [output.py at line 273]  +Output to directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README +/home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README Run "open index.html" to see more information about this process. quit -real 0m2.583s -user 0m2.278s -sys 0m0.302s -Code generation completed in 3 seconds +real 0m3.896s +user 0m3.101s +sys 0m0.655s +Code generation completed in 4 seconds ************************************************************ * * * W E L C O M E to * @@ -273,7 +263,7 @@ Code generation completed in 3 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -281,10 +271,9 @@ Code generation completed in 3 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt -Using default text editor "vi". 
Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -303,7 +292,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -311,10 +300,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT b/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. 
The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt index 68b4c46295..311ceaa803 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat index e50becb2d9..8728eabc9c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.5 2025-10-17 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/run_card.dat b/epochX/cudacpp/gg_tt01g.mad/Cards/run_card.dat index 1711d30371..d4c7c73e61 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/run_card.dat @@ -125,6 +125,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_tt01g.mad/Cards/run_card_default.dat index 364dbd21b0..730a05e322 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/run_card_default.dat @@ -125,6 +125,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_tt01g.mad/MGMEVersion.txt b/epochX/cudacpp/gg_tt01g.mad/MGMEVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/gg_tt01g.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_tt01g.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/.make_opts b/epochX/cudacpp/gg_tt01g.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/.make_opts +++ b/epochX/cudacpp/gg_tt01g.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/alfas_functions.f b/epochX/cudacpp/gg_tt01g.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/gg_tt01g.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/cuts.inc b/epochX/cudacpp/gg_tt01g.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/cuts.inc +++ b/epochX/cudacpp/gg_tt01g.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/make_opts b/epochX/cudacpp/gg_tt01g.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/make_opts +++ b/epochX/cudacpp/gg_tt01g.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/makefile b/epochX/cudacpp/gg_tt01g.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/makefile +++ b/epochX/cudacpp/gg_tt01g.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/run_card.inc b/epochX/cudacpp/gg_tt01g.mad/Source/run_card.inc index 2588190439..e169c1f193 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/run_card.inc +++ b/epochX/cudacpp/gg_tt01g.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. 
In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). + */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
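To make the Fortran-AOS to cudacpp-AOSOA index arithmetic used in the transposition code above explicit, here is a small standalone sketch; the sizes npar, np4 and neppM are illustrative stand-ins for the real values taken from CPPProcess and MemoryAccessMomenta:

// Standalone illustration of the AOS[nevt][npar][np4] vs AOSOA[npagM][npar][np4][neppM] index mapping.
#include <cstdio>
int main()
{
  constexpr int npar = 4, np4 = 4, neppM = 4; // illustrative sizes only
  const int ievt = 9, ipar = 2, ip4 = 1;      // one momentum component of one particle of one event
  const int ipagM = ievt / neppM;             // "page" of neppM consecutive events
  const int ieppM = ievt % neppM;             // position of the event inside its page
  const int fpos = ievt * npar * np4 + ipar * np4 + ip4;                                  // Fortran-style AOS offset
  const int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; // cudacpp AOSOA offset
  printf( "fpos=%d cpos=%d\n", fpos, cpos );  // F2C copies out[cpos] = in[fpos]; C2F swaps the two sides
  return 0;
}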
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MGVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
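The checkGpuBlas guard added to GpuRuntime.h above follows the same pattern as checkGpu, asserting on any non-SUCCESS gpuBlasStatus_t. A minimal sketch of wrapping a BLAS call with it (not part of this patch; the function and pointer names are invented, and the gpuBlas aliases are those defined in GpuAbstraction.h):

#include "GpuRuntime.h" // checkGpuBlas and the gpuBlas* aliases from GpuAbstraction.h

#ifndef MGONGPU_HAS_NO_BLAS
// y[i] += alpha * x[i] on device buffers, with every BLAS status checked
void deviceAxpy( const double* devX, double* devY, const int n )
{
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) );                             // cublasCreate or hipblasCreate
  const double alpha = 1.;
  checkGpuBlas( gpuBlasDaxpy( handle, n, &alpha, devX, 1, devY, 1 ) );  // cublasDaxpy or hipblasDaxpy
  checkGpuBlas( gpuBlasDestroy( handle ) );
}
#endif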
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
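The per-helicity BLAS temporaries allocated in MatrixElementKernels.cc above are consumed by color_sum_blas. The sketch below only illustrates how a color-matrix product can be batched over events with the gpuBlasDgemmStridedBatched alias; it is not the actual color_sum.h implementation, and the buffer layout and names are assumptions:

#include "GpuRuntime.h" // gpuBlas aliases and checkGpuBlas

#ifndef MGONGPU_HAS_NO_BLAS
// Sketch: tmp_e = colorMatrix * jamp_e for all events e in a single strided-batched GEMM.
// devColorMatrix: ncolor*ncolor, column-major, shared by all events (hence stride 0)
// devJamps:       ncolor*nevt, one ncolor vector per event
// devTmp:         ncolor*nevt, one ncolor output vector per event
void colorMatrixTimesJamps( gpuBlasHandle_t handle, const double* devColorMatrix,
                            const double* devJamps, double* devTmp, const int ncolor, const int nevt )
{
  const double alpha = 1., beta = 0.;
  checkGpuBlas( gpuBlasDgemmStridedBatched( handle, GPUBLAS_OP_N, GPUBLAS_OP_N,
                                            ncolor, 1, ncolor,         // m, n, k: (ncolor x ncolor) * (ncolor x 1)
                                            &alpha,
                                            devColorMatrix, ncolor, 0, // A, lda, strideA (same matrix for every event)
                                            devJamps, ncolor, ncolor,  // B, ldb, strideB (one vector per event)
                                            &beta,
                                            devTmp, ncolor, ncolor,    // C, ldc, strideC
                                            nevt ) );                  // batchCount = number of events
}
#endif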
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h index 65a101888d..2fa0ce29e0 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ 
namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer 
DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index b32f4b931e..9a72b09e5a 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
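The DeviceAccessJamp2 helper introduced earlier in this file addresses the jamp2 super-buffer as buffer[icol * nevt + ievt]. The kernel below is only a sketch of that layout and of the atomicAdd accumulation needed when several helicity streams update the same buffer concurrently; it is not part of this patch, and the kernel and argument names are invented:

// Sketch: accumulate |jamp|^2 for each color into a [ncolor][nevt] buffer laid out as icol*nevt+ievt.
// atomicAdd on double requires compute capability 6.0+ (and is available in HIP).
__global__ void accumulateJamp2( const double* jampRe, const double* jampIm, // [ncolor*nevt] for one helicity
                                 double* jamp2,                              // [ncolor*nevt] running sum over helicities
                                 const int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    const double re = jampRe[icol * nevt + ievt];
    const double im = jampIm[icol * nevt + ievt];
    atomicAdd( &jamp2[icol * nevt + ievt], re * re + im * im ); // concurrent helicity streams may hit the same entry
  }
}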
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -368,154 +424,43 @@ namespace mg5amcCpu jamp_sv[1] -= amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttx()?) 
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2 }, - { -2, 16 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -555,7 +500,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -588,6 +537,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -628,6 +581,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -748,8 +705,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -757,25 +714,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -920,13 +1055,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -938,18 +1067,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -974,93 +1108,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1102,7 +1173,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1125,7 +1196,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1134,21 +1205,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1162,8 +1235,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1179,11 +1254,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1285,14 +1361,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index feff1cc6e1..5d952c7419 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 3; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum 
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f index bc9bcfeb9b..008afc92ae 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f index db3c284caa..fc3ede89c4 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -442,51 +446,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. 
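C
C [Editor's illustrative sketch - not part of the generated patch]
C The hunk above replaces the flat DO IVEC=1,VECSIZE_USED loop with an
C outer loop over warps, so that the beam ordering IB(1)/IB(2) (taken
C from IMIRROR_VEC) and the SDE configuration ICONFIG (taken from
C ICONF_VEC via SYMCONF, see the added assignment just below) are set
C once per warp before unweighting. Assuming the NB_WARP_USED and
C WARP_SIZE variables already used elsewhere in this routine, the
C event index inside the nested loops is recovered as
C
C       DO CURR_WARP=1, NB_WARP_USED
C         DO IWARP=1, WARP_SIZE
C           IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP
C           ... ! per-event work for event IVEC (CONV, bias, UNWGT)
C         ENDDO
C       ENDDO
C
C so IVEC still covers 1..NB_WARP_USED*WARP_SIZE in the same order as
C the previous single loop, provided VECSIZE_USED equals
C NB_WARP_USED*WARP_SIZE.
C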
- IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/color_sum.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/color_sum.cc new file mode 100644 index 0000000000..b68b9250fd --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/color_sum.cc @@ -0,0 +1,427 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2 }, + { -2, 16 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const 
int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + 
fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! 
From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need 
one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/color_sum.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/configs.inc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/configs.inc index 99d3eecc56..0dbac30825 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/configs.inc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/configs.inc @@ -24,3 
+24,5 @@ C Diagram 3 DATA (SPROP(I,-2,3),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/3/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/cudacpp_overlay.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/driver.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/driver.f index ec5722702a..30cca27587 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/driver.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/fbridge.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/makefile_original.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f index 707ea40323..8481c73d0f 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -71,10 +71,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! 
-1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -227,17 +224,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -307,7 +293,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -350,7 +336,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -393,21 +380,24 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 2) /5.333333333333333D+00, - $ -6.666666666666666D-01/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 2) /16,-4/ C 1 T(1,2,3,4) - DATA (CF(I, 2),I= 1, 2) /-6.666666666666666D-01 - $ ,5.333333333333333D+00/ + DATA (CF(I),I= 3, 3) /16/ C 1 T(2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -446,10 +436,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -458,6 +450,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc index c8b3dbf03c..3519cda091 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + 
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! 
in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one 
event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... @@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -567,158 +623,43 @@ namespace mg5amcCpu jamp_sv[5] -= amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_gg_ttxg()?) - - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 9, 9, 9, 9, 9, 9 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 64, -8, -8, 1, 1, 10 }, - { -8, 64, 1, 10, -8, 1 }, - { -8, 1, 64, -8, 10, 1 }, - { 1, 10, -8, 64, 1, -8 }, - { 1, -8, 10, 1, 64, -8 }, - { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
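[Editor's note] The comment block above (removed here and relocated, together with the color matrix itself, into the new color_sum.cc later in this patch) relies on the color matrix being real and symmetric: writing jamp = A + iB, the quadratic form jamp-dagger C jamp equals A^T C A + B^T C B because the imaginary cross terms cancel, and with all color denominators equal (9 for this process) the symmetric off-diagonal terms can be folded into a factor 2 over the upper triangle. The self-contained C++ check below verifies this equivalence for the 6x6 gg->ttxg matrix quoted in this diff; the toy jamp values and the tolerance are illustrative assumptions, not values from the plugin.

// Standalone numerical check (illustration only) that the full complex quadratic
// form and the real upper-triangular form (issue #475) give the same |M|^2.
#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  constexpr int ncolor = 6;
  constexpr double denom[ncolor] = { 9, 9, 9, 9, 9, 9 };
  constexpr double cf[ncolor][ncolor] = {
    { 64, -8, -8, 1, 1, 10 },
    { -8, 64, 1, 10, -8, 1 },
    { -8, 1, 64, -8, 10, 1 },
    { 1, 10, -8, 64, 1, -8 },
    { 1, -8, 10, 1, 64, -8 },
    { 10, 1, 1, -8, -8, 64 } };
  // Arbitrary toy partial amplitudes (in the real code these are the jamp's for one event and one helicity)
  const std::complex<double> jamp[ncolor] = { { 0.1, -0.2 }, { 0.3, 0.4 }, { -0.5, 0.6 }, { 0.7, -0.8 }, { 0.9, 1.0 }, { -1.1, 1.2 } };

  // (1) Full quadratic form: sum_ij conj(jamp_i) * cf_ij / denom_i * jamp_j (imaginary part cancels)
  double me1 = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = 0; j < ncolor; j++ ) ztemp += cf[i][j] * jamp[j];
    me1 += ( std::conj( jamp[i] ) * ztemp ).real() / denom[i];
  }

  // (2) Real triangular form: diagonal term once, off-diagonal upper triangle with the factor 2 folded in
  double me2 = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = cf[i][i] / denom[i] * jamp[i].real();
    double ztempI = cf[i][i] / denom[i] * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztempR += 2 * cf[i][j] / denom[i] * jamp[j].real();
      ztempI += 2 * cf[i][j] / denom[i] * jamp[j].imag();
    }
    me2 += ztempR * jamp[i].real() + ztempI * jamp[i].imag();
  }

  assert( std::abs( me1 - me2 ) < 1e-12 * std::abs( me1 ) );
  return 0;
}

In the patch itself this logic no longer lives in calculate_jamps: the CPU keeps the triangular real form in color_sum_cpu, while the GPU path computes the color sum in color_sum_kernel (or via BLAS) in the new color_sum.cc.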
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -774,7 +715,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -808,6 +753,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -849,6 +798,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -969,8 +922,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -978,25 +931,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1141,13 +1272,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1159,18 +1284,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1195,93 +1325,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1323,7 +1390,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1346,7 +1413,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1355,21 +1422,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1383,8 +1452,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1400,11 +1471,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1506,14 +1578,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h index b583fc85fe..d7ce5daa6c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 16; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running 
sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f index 8843b88a23..ae729ed904 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f index b22dde0f92..aecfa311e2 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -442,51 +446,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. 
- IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/color_sum.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/color_sum.cc new file mode 100644 index 0000000000..9e3ce9d917 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/color_sum.cc @@ -0,0 +1,431 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 9, 9, 9, 9, 9, 9 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 64, -8, -8, 1, 1, 10 }, + { -8, 64, 1, 10, -8, 1 }, + { -8, 1, 64, -8, 10, 1 }, + { 1, 10, -8, 64, 1, -8 }, + { 1, -8, 10, 1, 64, -8 }, + { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for 
one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = 
E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x 
+ threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* 
ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/color_sum.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/configs.inc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/configs.inc index 1eb9c578f9..a3ad3e22cf 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/configs.inc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/configs.inc @@ 
-171,3 +171,5 @@ C Diagram 15 DATA (SPROP(I,-3,15),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/15/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/cudacpp_overlay.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/driver.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/driver.f index c2eadb2c31..aa93a3d195 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/driver.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/fbridge.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/makefile_original.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f index 7d44ae130e..6662900421 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -71,10 +71,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! 
-1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -243,17 +240,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -323,7 +309,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -366,7 +352,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(9) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -409,43 +396,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /7.111111111111111D+00, - $ -8.888888888888888D-01,-8.888888888888888D-01 - $ ,1.111111111111111D-01,1.111111111111111D-01,1.111111111111111D - $ +00/ + DATA DENOM/9/ + DATA (CF(I),I= 1, 6) /64,-16,-16,2,2,20/ C 1 T(1,2,5,3,4) - DATA (CF(I, 2),I= 1, 6) /-8.888888888888888D-01 - $ ,7.111111111111111D+00,1.111111111111111D-01,1.111111111111111D - $ +00,-8.888888888888888D-01,1.111111111111111D-01/ + DATA (CF(I),I= 7, 11) /64,2,20,-16,2/ C 1 T(1,5,2,3,4) - DATA (CF(I, 3),I= 1, 6) /-8.888888888888888D-01 - $ ,1.111111111111111D-01,7.111111111111111D+00, - $ -8.888888888888888D-01,1.111111111111111D+00,1.111111111111111D - $ -01/ + DATA (CF(I),I= 12, 15) /64,-16,20,2/ C 1 T(2,1,5,3,4) - DATA (CF(I, 4),I= 1, 6) /1.111111111111111D-01 - $ ,1.111111111111111D+00,-8.888888888888888D-01 - $ ,7.111111111111111D+00,1.111111111111111D-01, - $ -8.888888888888888D-01/ + DATA (CF(I),I= 16, 18) /64,2,-16/ C 1 T(2,5,1,3,4) - DATA (CF(I, 5),I= 1, 6) /1.111111111111111D-01, - $ -8.888888888888888D-01,1.111111111111111D+00,1.111111111111111D - $ -01,7.111111111111111D+00,-8.888888888888888D-01/ + DATA (CF(I),I= 19, 20) /64,-16/ C 1 T(5,1,2,3,4) - DATA (CF(I, 6),I= 1, 6) /1.111111111111111D+00 - $ ,1.111111111111111D-01,1.111111111111111D-01, - $ -8.888888888888888D-01,-8.888888888888888D-01 - $ ,7.111111111111111D+00/ + DATA (CF(I),I= 21, 21) /64/ C 1 T(5,2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
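The CF DATA statements above (and the CF_INDEX loop in the next hunk) store only the upper triangle of the symmetric color matrix as integers, with the off-diagonal entries pre-doubled (e.g. -8 becomes -16) and a single common denominator DENOM=9 divided out once at the end, replacing the old dense REAL*8 matrix. The same triangular trick (#475) is used by TriangularNormalizedColorMatrix in color_sum.cc above. The following minimal standalone C++ sketch only illustrates that arithmetic with dummy jamp values; it is not part of the patch or of the generated code.

#include <complex>
#include <cstdio>

int main()
{
  constexpr int ncolor = 6;
  // Packed upper triangle of the color matrix, row by row (ncolor*(ncolor+1)/2 = 21 entries);
  // integer-valued as in the Fortran DATA statements, with off-diagonal entries already doubled
  constexpr double cf[21] = { 64, -16, -16, 2, 2, 20,
                              64, 2, 20, -16, 2,
                              64, -16, 20, 2,
                              64, 2, -16,
                              64, -16,
                              64 };
  constexpr double denom = 9;        // common color denominator (DENOM)
  std::complex<double> jamp[ncolor]; // color amplitudes (dummy values for illustration)
  for( int i = 0; i < ncolor; i++ ) jamp[i] = std::complex<double>( 0.1 * ( i + 1 ), -0.05 * i );
  double meSq = 0;
  int idx = 0; // runs over the packed triangle exactly like CF_INDEX in matrix1.f
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = i; j < ncolor; j++ ) ztemp += cf[idx++] * jamp[j]; // only j >= i (symmetric matrix)
    meSq += std::real( ztemp * std::conj( jamp[i] ) );
  }
  meSq /= denom; // divide by DENOM once at the end
  std::printf( "|M|^2 (color sum) = %f\n", meSq );
  return 0;
}

For the GPU BLAS path, color_sum_blas in color_sum.cc above instead performs the color sum in two linear-algebra steps: a GEMM computing Ztemp = NormalizedColorMatrix x Jamps over all events (with the good helicities folded into the event dimension, i.e. nhel*nevt columns), followed by a strided-batched GEMM with m=n=1 that accumulates one per-event dot product Jamp . Ztemp into the MEs buffer (alpha=1, beta=1), once for the real parts and once for the imaginary parts. The plain CPU reference below sketches what those two calls compute for the real part only; the layouts and strides are simplified here (assumed orderings, single helicity, dummy values), so it illustrates the arithmetic rather than the exact cuBLAS/hipBLAS arguments.

#include <cstdio>
#include <vector>

int main()
{
  const int ncolor = 6, nevt = 4; // toy sizes (the production code uses nhel*nevt columns)
  std::vector<double> colorMat( ncolor * ncolor, 0.1 ); // normalized color matrix (dummy values)
  std::vector<double> jampsReal( ncolor * nevt, 1.0 );  // Re(jamp) per color and event (dummy values)
  std::vector<double> ztempReal( ncolor * nevt, 0.0 );  // Step 1 output
  std::vector<double> mes( nevt, 0.0 );                 // running |M|^2 per event
  // Step 1 (the GEMM): Ztemp[:,ievt] = NormalizedColorMatrix * Jamps[:,ievt] for all events at once
  for( int ievt = 0; ievt < nevt; ievt++ )
    for( int icol = 0; icol < ncolor; icol++ )
      for( int jcol = 0; jcol < ncolor; jcol++ )
        ztempReal[ievt * ncolor + icol] += colorMat[icol * ncolor + jcol] * jampsReal[jcol * nevt + ievt];
  // Step 2 (the strided-batched GEMM with m=n=1): one dot product per event, accumulated into MEs (beta=1)
  for( int ievt = 0; ievt < nevt; ievt++ )
    for( int icol = 0; icol < ncolor; icol++ )
      mes[ievt] += jampsReal[icol * nevt + ievt] * ztempReal[ievt * ncolor + icol];
  std::printf( "ME[0] real-part contribution = %f\n", mes[0] );
  return 0;
}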
@@ -549,10 +525,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -561,6 +539,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/addmothers.f index 9a31ed201d..d6cded9a2d 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else @@ -220,7 +221,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, ncolmp=0 endif if(mo_color.gt.1.and. - $ mo_color.ne.3.and.mo_color.ne.8)then + $ mo_color.ne.3.and.mo_color.ne.8.and.mo_color.ne.6)then da_color(1)=get_color(jpart(1,ida(1))) da_color(2)=get_color(jpart(1,ida(2))) call write_error(da_color(1), da_color(2), mo_color) @@ -326,8 +327,8 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif endif !end of check on LC -c Just zero helicity info for intermediate states - jpart(7,i) = 0 +c Just No helicity info for intermediate states + jpart(7,i) = 9 enddo ! 
do i 100 continue if (is_LC) call check_pure_internal_flow(icolalt,jpart, maxcolor) @@ -586,13 +587,13 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, i3=i3+1 c color for t-channels needs to be reversed if(i3.eq.1) icol(2,ires)=icolmp(1,i) - if(i3.eq.2) icol(1,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 c color for t-channels needs to be reversed if(i3bar.eq.1) icol(1,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(2,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(2,i) endif enddo @@ -764,6 +765,14 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, endif endif c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) + elseif(mo_color.eq.6.and.i3.eq.0.and.i3bar.eq.2)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue + elseif(mo_color.eq.6.and.i3.eq.2.and.i3bar.eq.0)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue else c Don't know how to deal with this call write_error(i3,i3bar,mo_color) @@ -814,12 +823,12 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(icolmp(1,i).gt.0)then i3=i3+1 if(i3.eq.1) icol(1,ires)=icolmp(1,i) - if(i3.eq.2) icol(2,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 if(i3bar.eq.1) icol(2,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(1,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(2,i) endif enddo @@ -830,23 +839,33 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(n3.le.1.and.n3bar.eq.0) icol(2,ires)=0 if(i3.ne.n3.or.i3bar.ne.n3bar) then - if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.0.and.i3.eq.0)then + if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.i3)then c This is an epsilon index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(1,ires)=maxcolor + if(i3.eq.0) then + maxcolor=maxcolor+1 + icol(1,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(2,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(2,ires)=-maxcolor endif - elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.0.and.i3bar.eq.0)then + elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.i3bar)then c This is an epsilonbar index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(2,ires)=maxcolor + if(i3bar.eq.0)then + maxcolor=maxcolor+1 + icol(2,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(1,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(1,ires)=-maxcolor endif elseif(n3.gt.0.and.n3bar.eq.0.and.i3-i3bar.eq.n3.or. $ n3bar.gt.0.and.n3.eq.0.and.i3bar-i3.eq.n3bar.or. 
@@ -961,6 +980,12 @@ subroutine fix_s_color_indices(n3,n3bar,i3,i3bar,ncolmp,icolmp, if(n3.eq.1) icol(1,ires)=max_n3 if(n3bar.eq.1) icol(2,ires)=min_n3bar endif + do i=ires,-1 + if (icol(1,i).eq.maxcol) icol(1,i)=mincol + if (icol(1,i).eq.-maxcol) icol(1,i)=-mincol + if (icol(2,i).eq.maxcol) icol(2,i)=mincol + if (icol(2,i).eq.-maxcol) icol(2,i)=-mincol + enddo c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) endif else diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cluster.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/color_sum.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x 
* blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cuts.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. + return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. 
Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. + +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/genps.f index 1c32e93f5d..5449ab9e30 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/genps.f @@ -124,7 +124,8 @@ subroutine gen_mom(iconfig,mincfig,maxcfig,invar,wgt,x,p1) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer fake_id + common/to_sprop/sprop,tprid,fake_id logical firsttime double precision xprop(3,nexternal),tprop(3,nexternal) @@ -1373,6 +1374,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1389,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not. 
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1637,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1657,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1818,8 +1834,8 @@ double precision function get_channel_cut(p, config) common/to_forest/ iforest, tstrategy integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) - integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer tprid(-max_branch:-1,lmaxconfigs), fake_id + common/to_sprop/sprop,tprid,fake_id double precision stot,m1,m2 common/to_stot/stot,m1,m2 @@ -1915,7 +1931,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1946,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile deleted file mode 100644 index 49e6800fff..0000000000 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile +++ /dev/null @@ -1,327 +0,0 @@ -SHELL := /bin/bash - -include ../../Source/make_opts - -# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) -# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing -include ../../src/cudacpp_config.mk -ifeq ($(CUDACPP_BUILDDIR),) -$(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) -endif - -# Disable all Fortran warnings? 
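The cuts.f and genps.f hunks above introduce an optional upper limit dsqrt_shatmax on sqrt(s_hat), with -1 meaning "no upper cut": passcuts rejects events outside the window, gen_s clamps smax to dsqrt_shatmax**2, and GENCMS caps TAUMAX accordingly. A minimal Python sketch of the same window logic follows; the standalone functions and their names are illustrative only and not part of the patch.

    # Illustrative sketch (not part of the patch) of the s_hat window introduced above:
    # dsqrt_shat is the lower cut on sqrt(s_hat), dsqrt_shatmax the optional upper cut,
    # with -1 meaning "no upper cut".
    def passes_shat_window(shat, dsqrt_shat=0.0, dsqrt_shatmax=-1.0):
        """Return True if the invariant mass squared 'shat' survives both cuts."""
        if dsqrt_shat != 0.0 and shat < dsqrt_shat**2:
            return False          # below the minimum sqrt(s_hat)
        if dsqrt_shatmax != -1.0 and shat > dsqrt_shatmax**2:
            return False          # above the maximum sqrt(s_hat)
        return True

    def clamp_smax(smax, dsqrt_shatmax=-1.0):
        """Mirror of the gen_s change: never generate invariants above the upper cut."""
        return min(smax, dsqrt_shatmax**2) if dsqrt_shatmax != -1.0 else smax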
-FFLAGS+= -w - -# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html -FFLAGS+= -cpp - -# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) -CXXFLAGS = -O3 -Wall -Wshadow -Wextra - -# Add -std=c++17 explicitly to avoid build errors on macOS -# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" -ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 -endif - -# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) -ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) - override CXX:=ccache $(CXX) -endif -###ifeq ($(USECCACHE)$(shell echo $(FC) | grep ccache),1) -### override FC:=ccache $(FC) -###endif - -# Load additional dependencies of the bias module, if present -ifeq (,$(wildcard ../bias_dependencies)) -BIASDEPENDENCIES = -else -include ../bias_dependencies -endif - -# Definitions - -LIBDIR = ../../lib/ -BINDIR = ../../bin/ -PROG = madevent - -ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") - include ../MadLoop_makefile_definitions -else - LINK_LOOP_LIBS = - LOOP_LIBS = - LOOP_INCLUDE = - LINK_MADLOOP_LIB = - MADLOOP_LIB = -endif - -LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias - -CUDACPP_MAKEFILE=cudacpp.mk -processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') -ifeq ($(BACKEND),cuda) -CUDACPP_COMMONLIB=mg5amc_common_cuda -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda -else ifeq ($(BACKEND),hip) -CUDACPP_COMMONLIB=mg5amc_common_hip -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip -else -CUDACPP_COMMONLIB=mg5amc_common_cpp -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp -endif - -LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) - -ifneq ("$(wildcard ../../Source/RUNNING)","") - LINKLIBS += -lrunning - LIBS += $(LIBDIR)librunning.$(libext) -endif - - -# Source files - -MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) -MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) -ifeq ($(strip $(MATRIX_HEL)),) - MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) -endif - - -PROCESS= myamp.o genps.o unwgt.o setcuts.o get_color.o \ - cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ - idenparts.o dummy_fct.o - -DSIG=driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) -DSIG_cudacpp=driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) - -SYMMETRY = symmetry.o idenparts.o - -# Binaries - -ifeq ($(UNAME),Darwin) -LDFLAGS += -lc++ # avoid 'Undefined symbols' for chrono::steady_clock on macOS (checked with otool -L libmg5amc_gg_ttx_cpp.so) -LDFLAGS += -mmacosx-version-min=11.3 # avoid "ld: warning: object file was built for newer macOS version than being linked" -else -LDFLAGS += -Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 (not supported on macOS) -endif - -# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) -.DEFAULT_GOAL := all - -ifeq ($(BACKEND),cuda) -all: $(PROG)_fortran 
$(CUDACPP_BUILDDIR)/$(PROG)_cuda -else ifeq ($(BACKEND),hip) -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip -else -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp -endif - -# Disable OpenMP by default: enable OpenMP only if USEOPENMP=1 (#758) -ifeq ($(USEOPENMP),1) -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -override OMPFLAGS = -fopenmp -LINKLIBS += -liomp5 # see #578 -LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -override OMPFLAGS = -fopenmp -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 -else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang -else -override OMPFLAGS = -fopenmp -endif -endif - -$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o - $(FC) -o $(PROG)_fortran $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) - -$(LIBS): .libs - -.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat - cd ../../Source; make - touch $@ - -$(CUDACPP_BUILDDIR)/.cudacpplibs: - $(MAKE) -f $(CUDACPP_MAKEFILE) - touch $@ - -# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH -# Use relative paths with respect to the executables ($ORIGIN on Linux) -# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary -ifeq ($(UNAME_S),Darwin) - override LIBFLAGSRPATH = -else ifeq ($(USEBUILDDIR),1) - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' -else - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' -endif - -.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link - -madevent_fortran_link: $(PROG)_fortran - rm -f $(PROG) - ln -s $(PROG)_fortran $(PROG) - -madevent_cuda_link: - $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) - -madevent_hip_link: - $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) - -madevent_cpp_link: - $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -override SUPPORTED_AVXS = cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto -madevent_%_link: - @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then echo "ERROR! 
Invalid target '$@' (supported madevent_cpp*_link targets are: $(foreach avx,$(SUPPORTED_AVXS),'madevent_cpp$(avx)_link'))"; exit 1; fi - $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_cuda now uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_hip also uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -counters.o: counters.cc timer.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -ompnumthreads.o: ompnumthreads.cc ompnumthreads.h - $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ - -$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) - $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) - -gensym: $(SYMMETRY) configs.inc $(LIBS) - $(FC) -o gensym $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) - -###ifeq (,$(wildcard fbridge.inc)) # Pointless: fbridge.inc always exists as this is the cudacpp-modified makefile! -###$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat -### cd ../../Source/MODEL; make -### -###$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat -### cd ../../Source; make -### -###$(LIBDIR)libpdf.$(libext): -### cd ../../Source/PDF; make -### -###$(LIBDIR)libgammaUPC.$(libext): -### cd ../../Source/PDF/gammaUPC; make -###endif - -# Add source so that the compiler finds the DiscreteSampler module. 
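The madevent_*_link targets above validate the requested backend against SUPPORTED_AVXS, and the cudacpp library names to link are derived from BACKEND (cuda and hip get their own suffix, every other supported backend links the *_cpp libraries). The Python sketch below only mirrors that naming logic for illustration; the process identifier "gg_ttx" and the function name are assumptions, not part of the build system.

    # Illustration only: how the makefile derives the cudacpp library names from BACKEND.
    SUPPORTED_BACKENDS = {"cuda", "hip", "cppnone", "cppsse4", "cppavx2",
                          "cpp512y", "cpp512z", "cppauto"}

    def cudacpp_libs(backend, process_id):
        if backend not in SUPPORTED_BACKENDS:
            raise ValueError(f"unsupported BACKEND '{backend}'")
        suffix = backend if backend in ("cuda", "hip") else "cpp"
        return f"mg5amc_common_{suffix}", f"mg5amc_{process_id}_{suffix}"

    # Example: cudacpp_libs("cppavx2", "gg_ttx")
    #          -> ("mg5amc_common_cpp", "mg5amc_gg_ttx_cpp")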
-$(MATRIX): %.o: %.f - $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%.o: %.f - $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%_cudacpp.o: %.f - $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ - -# Dependencies - -driver.f: genps.inc -symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc -genps.o: genps.inc nexternal.inc configs.inc -dummy_fct.0: run.inc genps.inc -cuts.o: genps.inc nexternal.inc pmass.inc -setcuts.o: genps.inc run_config.inc -invarients.o: genps.inc nexternal.inc -myamp.o: props.inc genps.inc nexternal.inc -reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ - run_config.inc -cluster.o: cluster.inc genps.inc nexternal.inc message.inc -addmothers.o: genps.inc nexternal.inc symswap.inc message.inc -unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ - run_config.inc -initcluster.o: message.inc - -# Extra dependencies on discretesampler.mod - -auto_dsig.o: .libs -driver.o: .libs -driver_cudacpp.o: .libs -$(MATRIX): .libs -genps.o: .libs - -# Cudacpp bldall targets - -ifeq ($(UNAME_P),ppc64le) -bldavxs: bldnone bldsse4 -else ifeq ($(UNAME_P),arm) -bldavxs: bldnone bldsse4 -else -bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z -endif - -ifneq ($(shell which hipcc 2>/dev/null),) -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldhip bldcuda bldavxs -else -bldall: bldhip bldavxs -endif -else -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldcuda bldavxs -else -bldall: bldavxs -endif -endif - -bldcuda: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cuda - -bldhip: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=hip - -bldnone: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppnone - -bldsse4: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 - -bldavx2: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 - -bld512y: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y - -bld512z: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z - -# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) - -clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn - $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(CUDACPP_BUILDDIR)/$(PROG)_hip - -cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src - $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall - rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs - rm -f .libs - -cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src - make -C ../../Source cleanall - rm -rf $(LIBDIR)libbias.$(libext) - rm -f ../../Source/*.mod ../../Source/*/*.mod - -distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation - $(MAKE) -f $(CUDACPP_MAKEFILE) distclean diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile new file mode 120000 index 0000000000..9fba275947 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile @@ -0,0 +1 @@ +makefile_wrapper.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile_original.mk new 
file mode 100644 index 0000000000..348c283be7 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile_original.mk @@ -0,0 +1,101 @@ +include ../../Source/make_opts +FFLAGS+= -w + +# Load additional dependencies of the bias module, if present +ifeq (,$(wildcard ../bias_dependencies)) +BIASDEPENDENCIES = +else +include ../bias_dependencies +endif + +# Definitions + +LIBDIR = ../../lib/ +BINDIR = ../../bin/ +PROG = madevent + +ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") + include ../MadLoop_makefile_definitions +else + LINK_LOOP_LIBS = + LOOP_LIBS = + LOOP_INCLUDE = + LINK_MADLOOP_LIB = + MADLOOP_LIB = +endif + +LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L../../lib/ -ldhelas -ldsample -lmodel -lgeneric -lpdf -lgammaUPC -lcernlib $(llhapdf) -lbias + +LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) + +ifneq ("$(wildcard ../../Source/RUNNING)","") + LINKLIBS += -lrunning + LIBS += $(LIBDIR)librunning.$(libext) +endif + + +# Source files + +MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) +MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) +ifeq ($(strip $(MATRIX_HEL)),) + MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) +endif + + +PROCESS= driver.o myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o \ + $(patsubst %.f,%.o,$(wildcard auto_dsig*.f)) \ + +SYMMETRY = symmetry.o idenparts.o + +# Binaries + +$(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) + $(FC) -o $(PROG) $(PROCESS) $(MATRIX) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) + +$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat + cd ../../Source/MODEL; make + +$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat + cd ../../Source; make + +$(LIBDIR)libpdf.$(libext): + cd ../../Source/PDF; make + +$(LIBDIR)libgammaUPC.$(libext): + cd ../../Source/PDF/gammaUPC; make + +# Add source so that the compiler finds the DiscreteSampler module. 
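As the new files in this part of the diff show, the SubProcesses makefile is now split into layers: the stock madevent rules in makefile_original.mk, the cudacpp additions in cudacpp_overlay.mk, and a three-line makefile_wrapper.mk that includes both, with 'makefile' itself turned into a symlink to the wrapper. The snippet below is an illustrative check of that layout, assuming exactly the file names shown in this diff; it is not part of the generated process directory.

    # Illustrative check of the makefile layering in a SubProcesses/P* directory
    # (assumes the file names shown in this diff).
    import os, re

    def describe_makefile_layering(pdir):
        mk = os.path.join(pdir, "makefile")
        target = os.readlink(mk) if os.path.islink(mk) else None
        with open(os.path.join(pdir, "makefile_wrapper.mk")) as f:
            includes = re.findall(r"^include\s+(\S+)", f.read(), re.M)
        return {"symlink_target": target, "include_order": includes}

    # Expected result under those assumptions:
    # {'symlink_target': 'makefile_wrapper.mk',
    #  'include_order': ['makefile_original.mk', 'cudacpp_overlay.mk']}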
+$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +# Dependencies + +driver.f: genps.inc +symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc +genps.o: genps.inc nexternal.inc configs.inc +dummy_fct.0: run.inc genps.inc +cuts.o: genps.inc nexternal.inc pmass.inc +setcuts.o: genps.inc run_config.inc +invarients.o: genps.inc nexternal.inc +myamp.o: props.inc genps.inc nexternal.inc +reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ + run_config.inc +cluster.o: cluster.inc genps.inc nexternal.inc message.inc +addmothers.o: genps.inc nexternal.inc symswap.inc message.inc +unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ + run_config.inc +initcluster.o: message.inc + +clean: + $(RM) *.o gensym madevent madevent_forhel diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/myamp.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min with dsqrt_shatmax**2 with the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/reweight.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/reweight.f index 0a0bafa7c1..9d8fe1c4f0 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/reweight.f @@ -976,9 +976,9 @@ logical function setclscales(p, keepq2bck, ivec) $ ' and jcentral is ',jcentral(1),jcentral(2) if (btest(mlevel,3)) then - write(*,'(a$)') 'QCD jets (final): ' + write(*,'(a,$)') 'QCD jets (final): ' do i=3,nexternal - if(iqjets(i).gt.0) write(*,'(i3$)') i + if(iqjets(i).gt.0) write(*,'(i3,$)') i enddo write(*,*) endif @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) @@ -1387,7 +1387,9 @@ double precision function rewgt(p, ivec) integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - include 'configs.inc' + integer fake_id + common/to_sprop/sprop,tprid,fake_id +c include 'configs.inc' real*8 xptj,xptb,xpta,xptl,xmtc real*8 xetamin,xqcut,deltaeta common /to_specxpt/xptj,xptb,xpta,xptl,xmtc,xetamin,xqcut,deltaeta @@ -1588,6 +1590,8 @@ double precision function rewgt(p, ivec) $ ipdgcl(1,igraphs(1),iproc),ipart,.false.).and. $ (goodjet(idacl(n,1)).or.goodjet(idacl(n,2)))) then c alpha_s weight + + if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1600,6 +1604,7 @@ double precision function rewgt(p, ivec) write(*,*)' as: ',alphas(alpsfact*dsqrt(q2now)), & '/',asref,' -> ',alphas(alpsfact*dsqrt(q2now))/asref write(*,*)' and G=',SQRT(4d0*PI*ALPHAS(scale)) + endif endif endif endif diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/symmetry.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/symmetry.f index 309540a0a2..d0706e90b4 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/symmetry.f @@ -51,6 +51,7 @@ program symmetry integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) + integer fake_id include 'configs.inc' data use_config/0,lmaxconfigs*0/ @@ -232,7 +233,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +243,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +261,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/unwgt.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/gg_tt01g.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/gg_tt01g.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py index 42d82818d0..2bc6174b85 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3306,7 +3353,7 @@ def edit_dummy_fct_from_file(self, filelist, outdir): def retro_compatible_custom_fct(lines, mode=None): f77_type = ['real*8', 'integer', 'double precision', 'logical'] - function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ + function_pat = re.compile(r'^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ % {'type':'|'.join(f77_type)}, re.I+re.M) include_pat = re.compile(r"\s+include\s+[\'\"]([\w\./]*)") @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - 
logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/check_param_card.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
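The new flavour_bias run_card parameter above takes two numbers, the abs(PDG) of the flavour to enhance and the enhancement factor, and its comment states that the increased sampling probability is compensated in the event weight (hence the forced event_norm='bias'). A minimal Python sketch of that compensation follows; it illustrates the intended bookkeeping only and is not the generator's implementation.

    # Illustration of the flavour_bias bookkeeping: events containing the chosen |PDG|
    # are sampled 'factor' times more often, so their weight is divided by 'factor'
    # to preserve cross sections.
    def biased_weight(event_pdgs, weight, flavour_bias=(5, 1)):
        pdg, factor = flavour_bias
        if factor != 1 and any(abs(p) == pdg for p in event_pdgs):
            return weight / factor   # compensate the enhanced sampling probability
        return weight

    # Example: an event with a (anti-)bottom quark and flavour_bias = (5, 100)
    # keeps 1/100 of its original weight.
    print(biased_weight([21, -5, 5], 0.42, (5, 100)))  # -> 0.0042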
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py index 9ff7390cf5..8de498fcc2 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. 
Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. (beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. 
Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" @@ -6868,8 +6890,9 @@ def handle_alarm(signum, frame): else: log_level=20 - - if run_card: + if run_card and (run_card['lpp1'] !=0 or run_card['lpp2'] !=0): + # They are likely case like lpp=+-3, where alpas not need reset + # but those have dedicated name of pdf avoid the reset as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, 'cteq6_l': 0.118, diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/file_writers.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/files.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/files.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - 
logger.warning(why) + logger.warning("fail to cp", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "

To save bandwidth not all diagrams were converted to PNG."; print PAGE "

To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 'nprocs' in 
opts: + self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/histograms.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/histograms.py index 51ae2914fc..0883cd9613 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the variation. 
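Note on the histograms.py hunk above and the guard added just below: the strict per-bin weight-count check is relaxed, and an empty weight list now raises a clear MadGraph5Error instead of failing later with an IndexError. A stripped-down illustration of that pattern follows; the regex, the function name parse_bin_line and the example header names are invented for this sketch and are not the HwU parser API.

    import re

    # Signed decimal numbers with an optional exponent (illustrative, not HwU.histo_bin_weight_re).
    WEIGHT_RE = re.compile(r'[+-]?\d+(?:\.\d*)?(?:[eE][+-]?\d+)?')

    def parse_bin_line(line, weight_header, central_name='central'):
        values = WEIGHT_RE.findall(line)
        if not values:
            raise ValueError('No weights were found on this bin line.')
        weights = {}
        # zip() quietly stops at the shorter sequence, mirroring the relaxed length check above.
        for name, value in zip(weight_header, values):
            weights['central' if name == central_name else name] = float(value)
        return weights

    # parse_bin_line('0.0  10.0  1.2e-3  4.5e-4',
    #                ['boundary_xmin', 'boundary_xmax', 'central', 'stat_error'])
    # -> {'boundary_xmin': 0.0, 'boundary_xmax': 10.0, 'central': 0.0012, 'stat_error': 0.00045}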
+ if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2405,10 +2404,10 @@ def output(self, path, format='gnuplot',number_of_ratios = -1, gnuplot_output_list=gnuplot_output_list_v5 else: output, _ = p.communicate() - output.decode(errors='ignore') + output = output.decode(errors='ignore') if not output: gnuplot_output_list=gnuplot_output_list_v5 - elif float(output.split()[1]) < 5. : + elif int(output.split()[1].split('.')[0]) < 5 : gnuplot_output_list=gnuplot_output_list_v4 else: gnuplot_output_list=gnuplot_output_list_v5 @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. 
Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. 
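In the launch_plugin.py hunk that follows, the make_opts key changes from FPTYPE to 'override FPTYPE'. With GNU make, an assignment prefixed by the override directive takes precedence over a FPTYPE=... given on the make command line, so the floating_type chosen in the run_card cannot be silently undone at build time. Roughly what writing such an entry looks like; set_make_opt is a hypothetical helper for illustration, not the real CommonRunCmd.update_make_opts_full.

    def set_make_opt(path, key, value):
        """Write 'override KEY = VALUE' into make_opts, replacing any previous KEY line."""
        try:
            with open(path) as f:
                lines = [l for l in f.read().splitlines()
                         if not l.replace('override', '', 1).strip().startswith(key)]
        except FileNotFoundError:
            lines = []
        lines.append('override %s = %s' % (key, value))
        with open(path, 'w') as f:
            f.write('\n'.join(lines) + '\n')

    # set_make_opt('Source/make_opts', 'FPTYPE', 'd')  -> "override FPTYPE = d" wins over "make FPTYPE=f"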
import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/lhe_parser.py index f6e47956cd..d4b94bab10 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -1792,7 +1794,10 @@ def add_decays(self, pdg_to_decay): if particle.pdg in pdg_to_decay and pdg_to_decay[particle.pdg]: one_decay = pdg_to_decay[particle.pdg].pop() self.add_decay_to_particle(i, one_decay) + particle.helicity = 9 return 
self.add_decays(pdg_to_decay) + + return self @@ -2166,10 +2171,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2961,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..dea35930ea 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz 
%(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3667,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_comine_iteration(self, line): + def do_combine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
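Note on the veto gating in this hunk and the CKKW one further down: only when the external MG5aMC_PY8_interface driver steers the shower with systematics enabled is Pythia8's own merging veto switched off (the driver then reweights events instead of vetoing them); a plain main164 run keeps the Pythia8 defaults. A condensed sketch of that decision, with a hypothetical helper name, not the actual setup_Pythia8RunAndCard logic:

    def veto_overrides(run_type, use_mg5amc_py8_interface, use_syst):
        if not (use_mg5amc_py8_interface and use_syst):
            return {}  # plain main164 runs keep Pythia8's own veto
        if run_type == 'MLM':
            return {'JetMatching:doVeto': False}
        if run_type == 'CKKW':
            return {'Merging:applyVeto': False}
        return {}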
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
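The do_pythia8 hunk above makes the new default path drive Pythia8's bundled main164 example directly (invoked with -c), falling back to the old MG5aMC_PY8_interface only when --old_interface is requested or main164 is missing. A small sketch of that lookup; the search paths are taken from the hunk, the helper name is illustrative.

    import os

    def find_pythia8_main(pythia8_path):
        """Return the main164 example shipped with Pythia8, or None if it was not built."""
        for rel in (('share', 'Pythia8', 'examples', 'main164'),
                    ('examples', 'main164')):
            exe = os.path.join(pythia8_path, *rel)
            if os.path.exists(exe):
                return exe
        return None  # caller falls back to the old MG5aMC_PY8_interface driver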
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
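Each parallel Pythia8 split now gets its own card with Main:numberOfEvents forced to its share of the events (partition_for_PY8, determined during the LHE splitting step) and HEPMCoutput:scaling rescaled accordingly. The divmod-based split generator used elsewhere in madevent_interface.py for chunking G directories illustrates the near-equal partitioning idea; it is reproduced here only for reference.

    def split(a, n):
        """Partition list a into n chunks whose sizes differ by at most one."""
        k, m = divmod(len(a), n)
        return (a[i*k + min(i, m):(i+1)*k + min(i+1, m)] for i in range(n))

    # Example: 10 event files over 3 jobs -> chunk sizes 4, 3, 3
    # [len(c) for c in split(list(range(10)), 3)] == [4, 3, 3]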
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -5596,6 +5635,19 @@ def do_plot(self, line): else: logger.info('No valid files for delphes plot') + def do_compile(self, line): + """compile the current directory """ + + args = self.split_arg(line) + self.ask_run_configuration(mode='parton') + self.run_card = banner_mod.RunCard(pjoin(self.me_dir, 'Cards', 'run_card.dat')) + self.configure_directory(html_opening =False) + + for Pdir in self.get_Pdir(): + misc.sprint(Pdir) + self.compile(['gensym'], cwd=Pdir) + self.compile(['madevent_forhel'], cwd=Pdir) + ############################################################################ def do_syscalc(self, line): """Evaluate systematics variation weights for a given run""" @@ -6132,7 +6184,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in 
Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file.' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6896,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6906,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7023,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... 
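The remove_empty_events helper added in the madevent_interface.py hunk above drops G directories whose events.lhe is empty and classifies why by scanning the tail of each channel log for known messages. A stripped-down sketch of that classification pattern follows; it omits the 'not found' re-inclusion, the BackRead reverse reader and the BW/cuts bookkeeping of the real method, and the function name is invented.

    import collections, os

    MESSAGES = {
        'Impossible BW configuration': 'bwconfig',
        'Loosen cuts or increase max_events': 'cuts',
        'all returned zero': 'zero',
    }

    def classify_empty_channels(gdirs):
        """Return (kept_dirs, reasons) where reasons maps a label to G directories."""
        reasons = collections.defaultdict(list)
        kept = []
        for gdir in gdirs:
            events = os.path.join(gdir, 'events.lhe')
            if os.path.exists(events) and os.path.getsize(events) >= 10:
                kept.append(gdir)
                continue
            label = 'unknown'
            try:
                with open(os.path.join(gdir, 'log.txt')) as log:
                    tail = log.readlines()[-200:]
            except OSError:
                tail = []
            for line in reversed(tail):
                for message, name in MESSAGES.items():
                    if message in line:
                        label = name
                        break
                if label != 'unknown':
                    break
            reasons[label].append(gdir)
        return kept, reasons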
self.exec_cmd('combine_events') @@ -6902,6 +7051,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7066,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7078,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7203,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7223,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/restore_data b/epochX/cudacpp/gg_tt01g.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/restore_data +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . 
-mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/sum_html.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s

' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/madevent b/epochX/cudacpp/gg_tt01g.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/madevent +++ b/epochX/cudacpp/gg_tt01g.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h index ff9f0d7f00..a18c3a4ea2 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc index 47a3a011b8..a5e188e4f8 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h index 76066c7bb1..24e0e80f84 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h index 7c6a082392..be5c5a6357 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
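The cudacpp_test.mk hunk below adds host detection so that -mavx2 -mfma is only passed to the googletest build on non-Darwin hosts, presumably because arm64 Mac toolchains reject those x86 SIMD flags. The same check, sketched in Python purely for illustration (the flag string is copied from the hunk, the function is not part of the build system):

    import platform

    def gtest_cmake_flags():
        # Mirrors the 'uname -s' check in cudacpp_test.mk: skip x86 SIMD flags on macOS.
        if platform.system() == 'Darwin':
            return []
        return ['-DCMAKE_CXX_FLAGS=-mavx2 -mfma']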
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index c216de0edd..2247620ea0 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +57,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006003379821777344  +DEBUG: model prefixing takes 0.004524707794189453  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,21 +150,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.021 s +1 processes with 16 diagrams generated in 0.018 s Total: 1 processes with 16 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_gg_ttg INFO: remove old information in CODEGEN_mad_gg_ttg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 @@ -177,25 +176,25 @@ FileWriter t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1552]  -Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s -Wrote files for 36 helas calls in 0.123 s +DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1577]  +Generated helas calls for 1 subprocesses (16 diagrams) in 0.045 s +Wrote files for 36 helas calls in 0.142 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha 
creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.331 s +ALOHA: aloha creates 5 routines in 0.245 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.310 s +ALOHA: aloha creates 10 routines in 0.234 s VVV1 VVV1 FFV1 @@ -205,38 +204,32 @@ ALOHA: aloha creates 10 routines in 0.310 s VVVV1 VVVV3 VVVV4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses/P1_gg_ttxg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #2 succeeded at 243 (offset 16 lines). 
-DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. +DEBUG: result.returncode =  0 [output.py at line 273]  +Output to directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README +/home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README Run "open index.html" to see more information about this process. quit -real 0m2.439s -user 0m2.135s -sys 0m0.297s -Code generation completed in 2 seconds +real 0m2.613s +user 0m2.182s +sys 0m0.417s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * @@ -249,7 +242,7 @@ Code generation completed in 2 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -257,10 +250,9 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -279,7 +271,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -287,10 +279,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt -Using default text editor "vi". 
Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_ttg.mad/COPYRIGHT b/epochX/cudacpp/gg_ttg.mad/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/gg_ttg.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_ttg.mad/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt index 68b4c46295..311ceaa803 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat index 3ace6e558c..cd6d16fc93 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.5 2025-10-17 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat b/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat index d087670827..a16ea5dee6 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat index 43e93cbf40..cdcd77f36d 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! 
minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_ttg.mad/MGMEVersion.txt b/epochX/cudacpp/gg_ttg.mad/MGMEVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/gg_ttg.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_ttg.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/Source/.make_opts b/epochX/cudacpp/gg_ttg.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/.make_opts +++ b/epochX/cudacpp/gg_ttg.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/gg_ttg.mad/Source/alfas_functions.f b/epochX/cudacpp/gg_ttg.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/gg_ttg.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/gg_ttg.mad/Source/cuts.inc b/epochX/cudacpp/gg_ttg.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/cuts.inc +++ b/epochX/cudacpp/gg_ttg.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/gg_ttg.mad/Source/make_opts b/epochX/cudacpp/gg_ttg.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/make_opts +++ b/epochX/cudacpp/gg_ttg.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/gg_ttg.mad/Source/makefile b/epochX/cudacpp/gg_ttg.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/makefile +++ b/epochX/cudacpp/gg_ttg.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/gg_ttg.mad/Source/run_card.inc b/epochX/cudacpp/gg_ttg.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/run_card.inc +++ b/epochX/cudacpp/gg_ttg.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. 
In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). + */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
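// A minimal standalone sketch (with hypothetical event counts) of the grid choice made in
// the Bridge constructor just above: start from 256 GPU threads per block, halve until the
// Fortran event count is an exact multiple, and give up below s_gputhreadsmin = 32.
#include <stdexcept>
inline void chooseGpuGrid( int nevt, int& gpublocks, int& gputhreads )
{
  constexpr int gputhreadsmin = 32; // same minimum as Bridge::s_gputhreadsmin
  gputhreads = 256;                 // same starting point as the Bridge constructor
  gpublocks = nevt / gputhreads;
  while( nevt != gpublocks * gputhreads )
  {
    gputhreads /= 2; // as in the constructor, this should never underflow for a valid nevt
    if( gputhreads < gputhreadsmin ) throw std::logic_error( "cannot choose gputhreads" );
    gpublocks = nevt / gputhreads;
  }
}
// e.g. nevt=16384 gives gpublocks=64, gputhreads=256; nevt=96 gives gpublocks=3, gputhreads=32.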
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
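// Spelled out for a single event, the index arithmetic behind the AOS <-> AOSOA transpositions
// in this file (a sketch; npar, np4 and neppM stand for the same compile-time constants as above):
//   Fortran/AOS  P_MULTI(0:3, NEXTERNAL, VECSIZE_USED)  ->  in [ievt][ipar][ip4]
//   C++/AOSOA    momenta[npagM][npar][np4][neppM]       ->  out[ipagM][ipar][ip4][ieppM]
inline int aosIndex( int ievt, int ipar, int ip4, int npar, int np4 )
{
  return ievt * npar * np4 + ipar * np4 + ip4; // the "fpos" used in the transpose loops
}
inline int aosoaIndex( int ievt, int ipar, int ip4, int npar, int np4, int neppM )
{
  const int ipagM = ievt / neppM; // SIMD page holding this event
  const int ieppM = ievt % neppM; // slot of this event inside its page
  return ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; // the "cpos"
}
// F2C copies out[aosoaIndex(...)] = in[aosIndex(...)]; C2F copies in the opposite direction,
// and when neppM == 1 the two layouts coincide, so a plain memcpy is enough.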
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MGVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
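// A minimal usage sketch (hypothetical kernel, sizes and function name) of how the stream and
// BLAS helpers introduced above in GpuAbstraction.h and GpuRuntime.h compose in a CUDA or HIP
// build: create a stream, bind a cuBLAS/hipBLAS handle to it, check every BLAS return code with
// checkGpuBlas, and launch work on that stream via gpuLaunchKernelStream. This is not plugin
// code, only an illustration of the intended wiring.
#include "GpuAbstraction.h"
#include "GpuRuntime.h"
__global__ void dummyKernel( int* out ) { out[0] = 1; } // hypothetical kernel
void streamAndBlasSketch( int* devOut )
{
  gpuStream_t stream;
  gpuStreamCreate( &stream );                         // cudaStreamCreate / hipStreamCreate, wrapped in checkGpu
#ifndef MGONGPU_HAS_NO_BLAS
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) );           // cublasCreate / hipblasCreate
  checkGpuBlas( gpuBlasSetStream( handle, stream ) ); // attach the handle to this stream
#endif
  gpuLaunchKernelStream( dummyKernel, 1, 32, stream, devOut ); // 1 block of 32 threads on this stream
  checkGpu( gpuDeviceSynchronize() );
#ifndef MGONGPU_HAS_NO_BLAS
  checkGpuBlas( gpuBlasDestroy( handle ) );
#endif
  gpuStreamDestroy( stream );
}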
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ?
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
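The hunks below add per-good-helicity "super-buffers" (m_pHelMEs, m_pHelJamps, m_pHelNumerators, m_pHelDenominators) that pack one [nevt]-sized slice per good helicity. A minimal standalone sketch of the ighel-major indexing convention assumed for the ME/numerator/denominator super-buffers (helSlot and sumOverGoodHel are hypothetical helpers, not plugin code; fptype is taken as double here):

// Element for good helicity ighel and event ievt in a [nGoodHel][nevt] super-buffer
__device__ inline double& helSlot( double* superBuf, int ighel, int ievt, int nevt )
{
  return superBuf[ighel * nevt + ievt]; // one contiguous [nevt] slice per good helicity
}

// Example: sum the per-helicity MEs of one event, as the helicity-selection step does with its running sums
__device__ inline double sumOverGoodHel( const double* superBuf, int nGoodHel, int ievt, int nevt )
{
  double me = 0;
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) me += superBuf[ighel * nevt + ievt];
  return me;
}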
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h index 65a101888d..2fa0ce29e0 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ 
namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer 
DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index 5de1c626c8..037b031386 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
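Before the next hunk, it may help to spell out the device-memory layout that the DeviceAccessJamp2 accessor above encodes: per-color quantities are stored color-major as [ncolor][nevt], so the value for color icol and event ievt sits at buffer[icol * nevt + ievt] and neighbouring events are adjacent in memory. A minimal standalone sketch (fillColorMajor is a hypothetical kernel, not plugin code; it assumes a 1D grid covering exactly nevt events, one event per thread, as in the accessor above):

__global__ void fillColorMajor( double* buf, int ncolor, double value )
{
  const int nevt = gridDim.x * blockDim.x;                // total number of threads == number of events
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per thread
  for( int icol = 0; icol < ncolor; icol++ )
    buf[icol * nevt + ievt] = value; // coalesced store: consecutive threads write consecutive addresses
}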
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -567,158 +623,43 @@ namespace mg5amcCpu jamp_sv[5] -= amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttxg()?) 
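For reference, the color algebra that the removed block below computed inline per event (and that the new color_sum_gpu/color_sum_cpu helpers now perform outside calculate_jamps) is the real quadratic form |M|^2 += sum_i sum_j cf[i][j] * Re( conj(jamp[i]) * jamp[j] ) / denom[i]. A minimal host-side reference sketch, assuming a row-major cf (colorSumNaive is a hypothetical helper, not plugin code):

#include <complex>
double colorSumNaive( const std::complex<double>* jamp, const double* cf, const double* denom, int ncolor )
{
  double me = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = 0, ztempI = 0;
    for( int j = 0; j < ncolor; j++ )
    {
      ztempR += cf[i * ncolor + j] * jamp[j].real(); // cf is real, so real and imaginary parts decouple
      ztempI += cf[i * ncolor + j] * jamp[j].imag();
    }
    me += ( ztempR * jamp[i].real() + ztempI * jamp[i].imag() ) / denom[i];
  }
  return me;
}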
- - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 9, 9, 9, 9, 9, 9 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 64, -8, -8, 1, 1, 10 }, - { -8, 64, 1, 10, -8, 1 }, - { -8, 1, 64, -8, 10, 1 }, - { 1, 10, -8, 64, 1, -8 }, - { 1, -8, 10, 1, 64, -8 }, - { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
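Because the color matrix is real, the quadratic form discussed just above splits into two independent real forms, reJ^T * CFhat * reJ + imJ^T * CFhat * imJ with CFhat[i][j] = cf[i][j] / denom[i], which is the kind of dense linear algebra (GEMV/DOT per event, or batched GEMM across events) that the new cuBLAS/hipBLAS color-sum path can delegate to a BLAS library. A hedged single-event sketch using plain cuBLAS calls (the function name and the column-major d_cfhat/d_reJ/d_imJ/d_tmp device arrays are assumptions for illustration, not the plugin's color_sum implementation; error checking omitted for brevity):

#include <cublas_v2.h>
double colorSumBlasOneEvent( cublasHandle_t h, const double* d_cfhat, const double* d_reJ, const double* d_imJ, double* d_tmp, int ncolor )
{
  const double one = 1.0, zero = 0.0;
  double meR = 0.0, meI = 0.0;
  // d_tmp = CFhat * reJ, then meR = reJ . d_tmp
  cublasDgemv( h, CUBLAS_OP_N, ncolor, ncolor, &one, d_cfhat, ncolor, d_reJ, 1, &zero, d_tmp, 1 );
  cublasDdot( h, ncolor, d_reJ, 1, d_tmp, 1, &meR ); // with the default host pointer mode this call blocks until meR is ready
  // d_tmp = CFhat * imJ, then meI = imJ . d_tmp
  cublasDgemv( h, CUBLAS_OP_N, ncolor, ncolor, &one, d_cfhat, ncolor, d_imJ, 1, &zero, d_tmp, 1 );
  cublasDdot( h, ncolor, d_imJ, 1, d_tmp, 1, &meI );
  return meR + meI; // |M|^2 contribution of this helicity for this event
}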
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -774,7 +715,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -808,6 +753,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -849,6 +798,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -969,8 +922,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -978,25 +931,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1141,13 +1272,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1159,18 +1284,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1195,93 +1325,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1323,7 +1390,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1346,7 +1413,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1355,21 +1422,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1383,8 +1452,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1400,11 +1471,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1506,14 +1578,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h index 2acfa000a7..69d8ea8b08 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 16; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running 
sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f index 10496aa04d..19937ed005 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f index 7c8695090c..9e5f9c9b0a 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -442,51 +446,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. 
- IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/color_sum.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/color_sum.cc new file mode 100644 index 0000000000..9e3ce9d917 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/color_sum.cc @@ -0,0 +1,431 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 9, 9, 9, 9, 9, 9 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 64, -8, -8, 1, 1, 10 }, + { -8, 64, 1, 10, -8, 1 }, + { -8, 1, 64, -8, 10, 1 }, + { 1, 10, -8, 64, 1, -8 }, + { 1, -8, 10, 1, 64, -8 }, + { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one 
specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = 
E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x 
+ threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* 
ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/color_sum.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/configs.inc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/configs.inc index 1eb9c578f9..a3ad3e22cf 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/configs.inc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/configs.inc @@ -171,3 +171,5 
@@ C Diagram 15 DATA (SPROP(I,-3,15),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/15/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/driver.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/driver.f index c2eadb2c31..aa93a3d195 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/driver.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/fbridge.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/makefile_original.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f index 797b19405d..48e24320cc 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -71,10 +71,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! 
-1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -243,17 +240,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -323,7 +309,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -366,7 +352,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(9) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -409,43 +396,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /7.111111111111111D+00, - $ -8.888888888888888D-01,-8.888888888888888D-01 - $ ,1.111111111111111D-01,1.111111111111111D-01,1.111111111111111D - $ +00/ + DATA DENOM/9/ + DATA (CF(I),I= 1, 6) /64,-16,-16,2,2,20/ C 1 T(1,2,5,3,4) - DATA (CF(I, 2),I= 1, 6) /-8.888888888888888D-01 - $ ,7.111111111111111D+00,1.111111111111111D-01,1.111111111111111D - $ +00,-8.888888888888888D-01,1.111111111111111D-01/ + DATA (CF(I),I= 7, 11) /64,2,20,-16,2/ C 1 T(1,5,2,3,4) - DATA (CF(I, 3),I= 1, 6) /-8.888888888888888D-01 - $ ,1.111111111111111D-01,7.111111111111111D+00, - $ -8.888888888888888D-01,1.111111111111111D+00,1.111111111111111D - $ -01/ + DATA (CF(I),I= 12, 15) /64,-16,20,2/ C 1 T(2,1,5,3,4) - DATA (CF(I, 4),I= 1, 6) /1.111111111111111D-01 - $ ,1.111111111111111D+00,-8.888888888888888D-01 - $ ,7.111111111111111D+00,1.111111111111111D-01, - $ -8.888888888888888D-01/ + DATA (CF(I),I= 16, 18) /64,2,-16/ C 1 T(2,5,1,3,4) - DATA (CF(I, 5),I= 1, 6) /1.111111111111111D-01, - $ -8.888888888888888D-01,1.111111111111111D+00,1.111111111111111D - $ -01,7.111111111111111D+00,-8.888888888888888D-01/ + DATA (CF(I),I= 19, 20) /64,-16/ C 1 T(5,1,2,3,4) - DATA (CF(I, 6),I= 1, 6) /1.111111111111111D+00 - $ ,1.111111111111111D-01,1.111111111111111D-01, - $ -8.888888888888888D-01,-8.888888888888888D-01 - $ ,7.111111111111111D+00/ + DATA (CF(I),I= 21, 21) /64/ C 1 T(5,2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
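C
C NB (illustrative note, not part of the generated code): the new DATA
C statements above store only the upper triangle of the symmetric colour
C matrix as integers, with the off-diagonal entries pre-doubled
C (e.g. -16=2*(-8), 2=2*1, 20=2*10) and a single denominator DENOM=9,
C replacing the old full REAL*8 CF(NCOLOR,NCOLOR) with entries such as
C 64/9=7.111... The running counter CF_INDEX in the colour sum below
C walks this packed storage row by row; equivalently, for J.GE.I,
C   CF_INDEX = (I-1)*NCOLOR - (I-1)*(I-2)/2 + (J-I+1)
C so that row I=1 uses CF(1:6), I=2 uses CF(7:11), ..., I=6 uses CF(21),
C matching the DATA ranges above. Dividing MATRIX1 by DENOM once at the
C end, together with the doubled off-diagonal terms and the symmetry of
C CF, reproduces the old full-matrix normalisation (the same triangular
C colour matrix idea as issue #475 in color_sum.cc).
C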
@@ -549,10 +525,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -561,6 +539,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/addmothers.f index 9a31ed201d..d6cded9a2d 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else @@ -220,7 +221,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, ncolmp=0 endif if(mo_color.gt.1.and. - $ mo_color.ne.3.and.mo_color.ne.8)then + $ mo_color.ne.3.and.mo_color.ne.8.and.mo_color.ne.6)then da_color(1)=get_color(jpart(1,ida(1))) da_color(2)=get_color(jpart(1,ida(2))) call write_error(da_color(1), da_color(2), mo_color) @@ -326,8 +327,8 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif endif !end of check on LC -c Just zero helicity info for intermediate states - jpart(7,i) = 0 +c Just No helicity info for intermediate states + jpart(7,i) = 9 enddo ! 
do i 100 continue if (is_LC) call check_pure_internal_flow(icolalt,jpart, maxcolor) @@ -586,13 +587,13 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, i3=i3+1 c color for t-channels needs to be reversed if(i3.eq.1) icol(2,ires)=icolmp(1,i) - if(i3.eq.2) icol(1,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 c color for t-channels needs to be reversed if(i3bar.eq.1) icol(1,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(2,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(2,i) endif enddo @@ -764,6 +765,14 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, endif endif c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) + elseif(mo_color.eq.6.and.i3.eq.0.and.i3bar.eq.2)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue + elseif(mo_color.eq.6.and.i3.eq.2.and.i3bar.eq.0)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue else c Don't know how to deal with this call write_error(i3,i3bar,mo_color) @@ -814,12 +823,12 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(icolmp(1,i).gt.0)then i3=i3+1 if(i3.eq.1) icol(1,ires)=icolmp(1,i) - if(i3.eq.2) icol(2,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 if(i3bar.eq.1) icol(2,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(1,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(2,i) endif enddo @@ -830,23 +839,33 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(n3.le.1.and.n3bar.eq.0) icol(2,ires)=0 if(i3.ne.n3.or.i3bar.ne.n3bar) then - if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.0.and.i3.eq.0)then + if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.i3)then c This is an epsilon index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(1,ires)=maxcolor + if(i3.eq.0) then + maxcolor=maxcolor+1 + icol(1,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(2,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(2,ires)=-maxcolor endif - elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.0.and.i3bar.eq.0)then + elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.i3bar)then c This is an epsilonbar index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(2,ires)=maxcolor + if(i3bar.eq.0)then + maxcolor=maxcolor+1 + icol(2,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(1,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(1,ires)=-maxcolor endif elseif(n3.gt.0.and.n3bar.eq.0.and.i3-i3bar.eq.n3.or. $ n3bar.gt.0.and.n3.eq.0.and.i3bar-i3.eq.n3bar.or. 
@@ -961,6 +980,12 @@ subroutine fix_s_color_indices(n3,n3bar,i3,i3bar,ncolmp,icolmp, if(n3.eq.1) icol(1,ires)=max_n3 if(n3bar.eq.1) icol(2,ires)=min_n3bar endif + do i=ires,-1 + if (icol(1,i).eq.maxcol) icol(1,i)=mincol + if (icol(1,i).eq.-maxcol) icol(1,i)=-mincol + if (icol(2,i).eq.maxcol) icol(2,i)=mincol + if (icol(2,i).eq.-maxcol) icol(2,i)=-mincol + enddo c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) endif else diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cluster.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/color_sum.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; 
+ const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. 
+# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
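# NB (illustrative usage note, not part of cudacpp.mk itself): with the
# HASBLAS logic above, a plain build auto-detects cuBLAS/hipBLAS from the
# CUDA_HOME/HIP_HOME headers, and the choice can be forced by hand, e.g.
#   make HASBLAS=hasBlas    # link -lcublas (CUDA) or -lhipblas (HIP) via BLASLIBFLAGS
#   make HASBLAS=hasNoBlas  # build without BLAS (-DMGONGPU_HAS_NO_BLAS)
# The surrounding hunks add $(BLASLIBFLAGS) to the link lines of the shared
# library and of the check/test executables; at runtime the BLAS colour sum
# is only used when a BLAS handle is passed to color_sum_gpu (cf. the
# CUDACPP_RUNTIME_BLASCOLORSUM comment in color_sum.cc), otherwise the
# color_sum_kernel path is taken.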
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024)
+
+# To be used after the project makefile
+SHELL := /bin/bash
+
+# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829)
+# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing
+include ../../src/cudacpp_config.mk
+ifeq ($(CUDACPP_BUILDDIR),)
+ $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!)
+endif
+
+# Basic uname helpers (if not already set)
+UNAME_S ?= $(shell uname -s)
+UNAME_P ?= $(shell uname -p)
+
+# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html
+FFLAGS+= -cpp
+
+# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740)
+CXXFLAGS = -O3 -Wall -Wshadow -Wextra
+
+# Add -std=c++17 explicitly to avoid build errors on macOS
+# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked"
+ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),)
+ CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3
+endif
+
+# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran)
+ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
+ override CXX := ccache $(CXX)
+endif
+
+# ----------------------------------------------------------------------
+# Backend library names and process id
+# ----------------------------------------------------------------------
+CUDACPP_MAKEFILE := cudacpp.mk
+processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
+
+ifeq ($(BACKEND),cuda)
+ CUDACPP_COMMONLIB := mg5amc_common_cuda
+ CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda
+else ifeq ($(BACKEND),hip)
+ CUDACPP_COMMONLIB := mg5amc_common_hip
+ CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip
+else
+ CUDACPP_COMMONLIB := mg5amc_common_cpp
+ CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp
+endif
+
+# ----------------------------------------------------------------------
+# Libraries and link line adjustments
+# ----------------------------------------------------------------------
+# Prefer LIBDIR everywhere; base makefile already defines LIBDIR.
+LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \
+ -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias
+
+# OpenMP: enable only if requested, USEOPENMP=1 (#758)
+ifeq ($(USEOPENMP),1)
+ ifneq ($(shell $(CXX) --version | egrep '^Intel'),)
+ override OMPFLAGS = -fopenmp
+ LINKLIBS += -liomp5 # see #578
+ LINKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy'
+ else ifneq ($(shell $(CXX) --version | egrep '^clang'),)
+ override OMPFLAGS = -fopenmp
+ # For the *cpp* binary with clang, ensure libomp is found
+ $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604
+ else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),)
+ override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang
+ else
+ override OMPFLAGS = -fopenmp
+ endif
+endif
+
+# ----------------------------------------------------------------------
+# Objects & targets
+# ----------------------------------------------------------------------
+# Keep driver* separate from PROCESS; we form DSIG groups below.
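Editor's note, not part of the diff: the overlay above maps the BACKEND choice onto a library suffix, with every cpp* backend sharing the single _cpp library. A minimal C++ sketch of that naming convention follows; the function names are illustrative only.

// Editorial sketch only: reproduce the BACKEND -> library-suffix convention used by
// CUDACPP_COMMONLIB / CUDACPP_BACKENDLIB above ("cuda", "hip", everything else -> "cpp").
#include <string>

std::string backendLibSuffix( const std::string& backend )
{
  if( backend == "cuda" ) return "cuda";
  if( backend == "hip" ) return "hip";
  return "cpp"; // cppnone, cppsse4, cppavx2, cpp512y, cpp512z, cppauto all map to _cpp
}

std::string backendLib( const std::string& processidShort, const std::string& backend )
{
  return "mg5amc_" + processidShort + "_" + backendLibSuffix( backend );
}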
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cuts.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. + return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. 
Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. + +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/genps.f index 1c32e93f5d..5449ab9e30 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/genps.f @@ -124,7 +124,8 @@ subroutine gen_mom(iconfig,mincfig,maxcfig,invar,wgt,x,p1) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer fake_id + common/to_sprop/sprop,tprid,fake_id logical firsttime double precision xprop(3,nexternal),tprop(3,nexternal) @@ -1373,6 +1374,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1389,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not. 
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1637,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1657,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1818,8 +1834,8 @@ double precision function get_channel_cut(p, config) common/to_forest/ iforest, tstrategy integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) - integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer tprid(-max_branch:-1,lmaxconfigs), fake_id + common/to_sprop/sprop,tprid,fake_id double precision stot,m1,m2 common/to_stot/stot,m1,m2 @@ -1915,7 +1931,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1946,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile b/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile deleted file mode 100644 index 49e6800fff..0000000000 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile +++ /dev/null @@ -1,327 +0,0 @@ -SHELL := /bin/bash - -include ../../Source/make_opts - -# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) -# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing -include ../../src/cudacpp_config.mk -ifeq ($(CUDACPP_BUILDDIR),) -$(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) -endif - -# Disable all Fortran warnings? 
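Editor's note, not part of the diff: the new SubProcesses/fbridge.h added earlier in this diff declares the Fortran-callable bridge entry points (fbridgecreate_, fbridgesequence_*, fbridgedelete_, fbridgegetngoodhel_). The sketch below illustrates calling that interface directly from C++; it assumes the cudacpp plugin headers and the generated process library are available, and the event and particle counts are arbitrary placeholders.

// Editorial sketch only: create a bridge, run one matrix-element evaluation without
// multichannel weights, then delete the bridge. Signatures follow fbridge.h above.
#include "fbridge.h"
#include <vector>

using namespace mg5amcCpu; // assumption: mg5amcGpu in GPU builds, as in fbridge.h

int main()
{
  const int nevt = 16, npar = 5, np4 = 4; // placeholder sizes (5 external particles, 4-momenta)
  CppObjectInFortran* bridge = nullptr;
  fbridgecreate_( &bridge, &nevt, &npar, &np4 );
  std::vector<double> momenta( nevt * npar * np4, 0. ), gs( nevt, 1.2 );
  std::vector<double> rndhel( nevt, 0.5 ), rndcol( nevt, 0.5 ), mes( nevt, 0. );
  std::vector<int> selhel( nevt, 0 ), selcol( nevt, 0 );
  const bool goodHelOnly = false;
  fbridgesequence_nomultichannel_( &bridge, momenta.data(), gs.data(), rndhel.data(), rndcol.data(),
                                   mes.data(), selhel.data(), selcol.data(), &goodHelOnly );
  fbridgedelete_( &bridge );
  return 0;
}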
-FFLAGS+= -w - -# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html -FFLAGS+= -cpp - -# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) -CXXFLAGS = -O3 -Wall -Wshadow -Wextra - -# Add -std=c++17 explicitly to avoid build errors on macOS -# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" -ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 -endif - -# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) -ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) - override CXX:=ccache $(CXX) -endif -###ifeq ($(USECCACHE)$(shell echo $(FC) | grep ccache),1) -### override FC:=ccache $(FC) -###endif - -# Load additional dependencies of the bias module, if present -ifeq (,$(wildcard ../bias_dependencies)) -BIASDEPENDENCIES = -else -include ../bias_dependencies -endif - -# Definitions - -LIBDIR = ../../lib/ -BINDIR = ../../bin/ -PROG = madevent - -ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") - include ../MadLoop_makefile_definitions -else - LINK_LOOP_LIBS = - LOOP_LIBS = - LOOP_INCLUDE = - LINK_MADLOOP_LIB = - MADLOOP_LIB = -endif - -LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias - -CUDACPP_MAKEFILE=cudacpp.mk -processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') -ifeq ($(BACKEND),cuda) -CUDACPP_COMMONLIB=mg5amc_common_cuda -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda -else ifeq ($(BACKEND),hip) -CUDACPP_COMMONLIB=mg5amc_common_hip -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip -else -CUDACPP_COMMONLIB=mg5amc_common_cpp -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp -endif - -LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) - -ifneq ("$(wildcard ../../Source/RUNNING)","") - LINKLIBS += -lrunning - LIBS += $(LIBDIR)librunning.$(libext) -endif - - -# Source files - -MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) -MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) -ifeq ($(strip $(MATRIX_HEL)),) - MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) -endif - - -PROCESS= myamp.o genps.o unwgt.o setcuts.o get_color.o \ - cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ - idenparts.o dummy_fct.o - -DSIG=driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) -DSIG_cudacpp=driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) - -SYMMETRY = symmetry.o idenparts.o - -# Binaries - -ifeq ($(UNAME),Darwin) -LDFLAGS += -lc++ # avoid 'Undefined symbols' for chrono::steady_clock on macOS (checked with otool -L libmg5amc_gg_ttx_cpp.so) -LDFLAGS += -mmacosx-version-min=11.3 # avoid "ld: warning: object file was built for newer macOS version than being linked" -else -LDFLAGS += -Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 (not supported on macOS) -endif - -# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) -.DEFAULT_GOAL := all - -ifeq ($(BACKEND),cuda) -all: $(PROG)_fortran 
$(CUDACPP_BUILDDIR)/$(PROG)_cuda -else ifeq ($(BACKEND),hip) -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip -else -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp -endif - -# Disable OpenMP by default: enable OpenMP only if USEOPENMP=1 (#758) -ifeq ($(USEOPENMP),1) -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -override OMPFLAGS = -fopenmp -LINKLIBS += -liomp5 # see #578 -LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -override OMPFLAGS = -fopenmp -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 -else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang -else -override OMPFLAGS = -fopenmp -endif -endif - -$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o - $(FC) -o $(PROG)_fortran $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) - -$(LIBS): .libs - -.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat - cd ../../Source; make - touch $@ - -$(CUDACPP_BUILDDIR)/.cudacpplibs: - $(MAKE) -f $(CUDACPP_MAKEFILE) - touch $@ - -# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH -# Use relative paths with respect to the executables ($ORIGIN on Linux) -# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary -ifeq ($(UNAME_S),Darwin) - override LIBFLAGSRPATH = -else ifeq ($(USEBUILDDIR),1) - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' -else - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' -endif - -.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link - -madevent_fortran_link: $(PROG)_fortran - rm -f $(PROG) - ln -s $(PROG)_fortran $(PROG) - -madevent_cuda_link: - $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) - -madevent_hip_link: - $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) - -madevent_cpp_link: - $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -override SUPPORTED_AVXS = cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto -madevent_%_link: - @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then echo "ERROR! 
Invalid target '$@' (supported madevent_cpp*_link targets are: $(foreach avx,$(SUPPORTED_AVXS),'madevent_cpp$(avx)_link'))"; exit 1; fi - $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_cuda now uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_hip also uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -counters.o: counters.cc timer.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -ompnumthreads.o: ompnumthreads.cc ompnumthreads.h - $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ - -$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) - $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) - -gensym: $(SYMMETRY) configs.inc $(LIBS) - $(FC) -o gensym $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) - -###ifeq (,$(wildcard fbridge.inc)) # Pointless: fbridge.inc always exists as this is the cudacpp-modified makefile! -###$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat -### cd ../../Source/MODEL; make -### -###$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat -### cd ../../Source; make -### -###$(LIBDIR)libpdf.$(libext): -### cd ../../Source/PDF; make -### -###$(LIBDIR)libgammaUPC.$(libext): -### cd ../../Source/PDF/gammaUPC; make -###endif - -# Add source so that the compiler finds the DiscreteSampler module. 
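Editor's note, not part of the diff: the cuts.f and genps.f changes earlier in this diff introduce an optional upper bound on sqrt(s-hat), where dsqrt_shatmax = -1 means no upper cut; otherwise events above dsqrt_shatmax**2 are rejected and the phase-space generator clamps smax and TAUMAX accordingly. A compact C++ sketch of the resulting window test follows.

// Editorial sketch only, mirroring the Fortran logic added to cuts.f/genps.f above:
// accept an event with invariant mass squared shat if it lies inside the optional window
// [dsqrt_shat**2, dsqrt_shatmax**2], where dsqrt_shatmax = -1 disables the upper bound.
bool passShatWindow( double shat, double dsqrtShat, double dsqrtShatMax )
{
  if( shat < dsqrtShat * dsqrtShat ) return false;                              // lower cut (dsqrt_shat)
  if( dsqrtShatMax != -1. && shat > dsqrtShatMax * dsqrtShatMax ) return false; // upper cut (dsqrt_shatmax)
  return true;
}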
-$(MATRIX): %.o: %.f - $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%.o: %.f - $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%_cudacpp.o: %.f - $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ - -# Dependencies - -driver.f: genps.inc -symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc -genps.o: genps.inc nexternal.inc configs.inc -dummy_fct.0: run.inc genps.inc -cuts.o: genps.inc nexternal.inc pmass.inc -setcuts.o: genps.inc run_config.inc -invarients.o: genps.inc nexternal.inc -myamp.o: props.inc genps.inc nexternal.inc -reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ - run_config.inc -cluster.o: cluster.inc genps.inc nexternal.inc message.inc -addmothers.o: genps.inc nexternal.inc symswap.inc message.inc -unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ - run_config.inc -initcluster.o: message.inc - -# Extra dependencies on discretesampler.mod - -auto_dsig.o: .libs -driver.o: .libs -driver_cudacpp.o: .libs -$(MATRIX): .libs -genps.o: .libs - -# Cudacpp bldall targets - -ifeq ($(UNAME_P),ppc64le) -bldavxs: bldnone bldsse4 -else ifeq ($(UNAME_P),arm) -bldavxs: bldnone bldsse4 -else -bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z -endif - -ifneq ($(shell which hipcc 2>/dev/null),) -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldhip bldcuda bldavxs -else -bldall: bldhip bldavxs -endif -else -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldcuda bldavxs -else -bldall: bldavxs -endif -endif - -bldcuda: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cuda - -bldhip: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=hip - -bldnone: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppnone - -bldsse4: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 - -bldavx2: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 - -bld512y: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y - -bld512z: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z - -# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) - -clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn - $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(CUDACPP_BUILDDIR)/$(PROG)_hip - -cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src - $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall - rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs - rm -f .libs - -cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src - make -C ../../Source cleanall - rm -rf $(LIBDIR)libbias.$(libext) - rm -f ../../Source/*.mod ../../Source/*/*.mod - -distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation - $(MAKE) -f $(CUDACPP_MAKEFILE) distclean diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile b/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile new file mode 120000 index 0000000000..9fba275947 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile @@ -0,0 +1 @@ +makefile_wrapper.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile_original.mk new file mode 
100644 index 0000000000..348c283be7 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile_original.mk @@ -0,0 +1,101 @@ +include ../../Source/make_opts +FFLAGS+= -w + +# Load additional dependencies of the bias module, if present +ifeq (,$(wildcard ../bias_dependencies)) +BIASDEPENDENCIES = +else +include ../bias_dependencies +endif + +# Definitions + +LIBDIR = ../../lib/ +BINDIR = ../../bin/ +PROG = madevent + +ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") + include ../MadLoop_makefile_definitions +else + LINK_LOOP_LIBS = + LOOP_LIBS = + LOOP_INCLUDE = + LINK_MADLOOP_LIB = + MADLOOP_LIB = +endif + +LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L../../lib/ -ldhelas -ldsample -lmodel -lgeneric -lpdf -lgammaUPC -lcernlib $(llhapdf) -lbias + +LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) + +ifneq ("$(wildcard ../../Source/RUNNING)","") + LINKLIBS += -lrunning + LIBS += $(LIBDIR)librunning.$(libext) +endif + + +# Source files + +MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) +MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) +ifeq ($(strip $(MATRIX_HEL)),) + MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) +endif + + +PROCESS= driver.o myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o \ + $(patsubst %.f,%.o,$(wildcard auto_dsig*.f)) \ + +SYMMETRY = symmetry.o idenparts.o + +# Binaries + +$(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) + $(FC) -o $(PROG) $(PROCESS) $(MATRIX) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) + +$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat + cd ../../Source/MODEL; make + +$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat + cd ../../Source; make + +$(LIBDIR)libpdf.$(libext): + cd ../../Source/PDF; make + +$(LIBDIR)libgammaUPC.$(libext): + cd ../../Source/PDF/gammaUPC; make + +# Add source so that the compiler finds the DiscreteSampler module. 
+$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +# Dependencies + +driver.f: genps.inc +symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc +genps.o: genps.inc nexternal.inc configs.inc +dummy_fct.0: run.inc genps.inc +cuts.o: genps.inc nexternal.inc pmass.inc +setcuts.o: genps.inc run_config.inc +invarients.o: genps.inc nexternal.inc +myamp.o: props.inc genps.inc nexternal.inc +reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ + run_config.inc +cluster.o: cluster.inc genps.inc nexternal.inc message.inc +addmothers.o: genps.inc nexternal.inc symswap.inc message.inc +unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ + run_config.inc +initcluster.o: message.inc + +clean: + $(RM) *.o gensym madevent madevent_forhel diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/myamp.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min with dsqrt_shatmax**2 with the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/reweight.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/reweight.f index 0a0bafa7c1..9d8fe1c4f0 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/reweight.f @@ -976,9 +976,9 @@ logical function setclscales(p, keepq2bck, ivec) $ ' and jcentral is ',jcentral(1),jcentral(2) if (btest(mlevel,3)) then - write(*,'(a$)') 'QCD jets (final): ' + write(*,'(a,$)') 'QCD jets (final): ' do i=3,nexternal - if(iqjets(i).gt.0) write(*,'(i3$)') i + if(iqjets(i).gt.0) write(*,'(i3,$)') i enddo write(*,*) endif @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) @@ -1387,7 +1387,9 @@ double precision function rewgt(p, ivec) integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - include 'configs.inc' + integer fake_id + common/to_sprop/sprop,tprid,fake_id +c include 'configs.inc' real*8 xptj,xptb,xpta,xptl,xmtc real*8 xetamin,xqcut,deltaeta common /to_specxpt/xptj,xptb,xpta,xptl,xmtc,xetamin,xqcut,deltaeta @@ -1588,6 +1590,8 @@ double precision function rewgt(p, ivec) $ ipdgcl(1,igraphs(1),iproc),ipart,.false.).and. $ (goodjet(idacl(n,1)).or.goodjet(idacl(n,2)))) then c alpha_s weight + + if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1600,6 +1604,7 @@ double precision function rewgt(p, ivec) write(*,*)' as: ',alphas(alpsfact*dsqrt(q2now)), & '/',asref,' -> ',alphas(alpsfact*dsqrt(q2now))/asref write(*,*)' and G=',SQRT(4d0*PI*ALPHAS(scale)) + endif endif endif endif diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/symmetry.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/symmetry.f index 309540a0a2..d0706e90b4 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/symmetry.f @@ -51,6 +51,7 @@ program symmetry integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) + integer fake_id include 'configs.inc' data use_config/0,lmaxconfigs*0/ @@ -232,7 +233,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +243,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +261,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/unwgt.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/gg_ttg.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/gg_ttg.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py index 42d82818d0..2bc6174b85 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3306,7 +3353,7 @@ def edit_dummy_fct_from_file(self, filelist, outdir): def retro_compatible_custom_fct(lines, mode=None): f77_type = ['real*8', 'integer', 'double precision', 'logical'] - function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ + function_pat = re.compile(r'^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ % {'type':'|'.join(f77_type)}, re.I+re.M) include_pat = re.compile(r"\s+include\s+[\'\"]([\w\./]*)") @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - 
logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/check_param_card.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
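Editor's note, not part of the diff: the run_card gains a hidden flavour_bias parameter above, documented as e.g. '5,100', meaning events with the chosen flavour are generated 100 times more often while their weight is divided by 100, which is why event_norm is forced to 'bias'. The sketch below illustrates that compensating reweighting; the function name is hypothetical.

// Editorial sketch only: compensate an enhanced sampling probability by dividing the
// event weight, so the biased sample still integrates to the unbiased cross section.
double compensateFlavourBias( double weight, bool eventHasBiasedFlavour, double enhancement )
{
  return eventHasBiasedFlavour ? weight / enhancement : weight;
}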
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py index 9ff7390cf5..8de498fcc2 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. 
Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. (beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. 
Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" @@ -6868,8 +6890,9 @@ def handle_alarm(signum, frame): else: log_level=20 - - if run_card: + if run_card and (run_card['lpp1'] !=0 or run_card['lpp2'] !=0): + # They are likely case like lpp=+-3, where alpas not need reset + # but those have dedicated name of pdf avoid the reset as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, 'cteq6_l': 0.118, diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/file_writers.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/files.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/files.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - logger.warning(why) + 
logger.warning("fail to cp", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "
To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "
To save bandwidth not all diagrams were converted to PNG."; print PAGE "
To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 'nprocs' in opts: + 
self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/histograms.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/histograms.py index 51ae2914fc..0883cd9613 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the variation. 
+ if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2405,10 +2404,10 @@ def output(self, path, format='gnuplot',number_of_ratios = -1, gnuplot_output_list=gnuplot_output_list_v5 else: output, _ = p.communicate() - output.decode(errors='ignore') + output = output.decode(errors='ignore') if not output: gnuplot_output_list=gnuplot_output_list_v5 - elif float(output.split()[1]) < 5. : + elif int(output.split()[1].split('.')[0]) < 5 : gnuplot_output_list=gnuplot_output_list_v4 else: gnuplot_output_list=gnuplot_output_list_v5 @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. 
Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. 
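Aside on the histograms.py hunk above: the gnuplot version probe now decodes the subprocess output and compares the integer major version instead of calling float() on the whole token, which is fragile for version strings that are not plain floats. A small self-contained sketch of that parsing, assuming the probe returns a "gnuplot X.Y ..." banner (the example banners below are made up):

def gnuplot_major_version(version_output: bytes) -> int:
    # decode the raw subprocess output, then keep only the integer major version
    text = version_output.decode(errors='ignore')   # e.g. "gnuplot 5.4 patchlevel 2"
    return int(text.split()[1].split('.')[0])

assert gnuplot_major_version(b"gnuplot 5.4 patchlevel 2") == 5
assert gnuplot_major_version(b"gnuplot 4.6 patchlevel 6") == 4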
import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/lhe_parser.py index f6e47956cd..d4b94bab10 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -1792,7 +1794,10 @@ def add_decays(self, pdg_to_decay): if particle.pdg in pdg_to_decay and pdg_to_decay[particle.pdg]: one_decay = pdg_to_decay[particle.pdg].pop() self.add_decay_to_particle(i, one_decay) + particle.helicity = 9 return 
self.add_decays(pdg_to_decay) + + return self @@ -2166,10 +2171,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2961,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..dea35930ea 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz 
%(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3667,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_comine_iteration(self, line): + def do_combine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -5596,6 +5635,19 @@ def do_plot(self, line): else: logger.info('No valid files for delphes plot') + def do_compile(self, line): + """compile the current directory """ + + args = self.split_arg(line) + self.ask_run_configuration(mode='parton') + self.run_card = banner_mod.RunCard(pjoin(self.me_dir, 'Cards', 'run_card.dat')) + self.configure_directory(html_opening =False) + + for Pdir in self.get_Pdir(): + misc.sprint(Pdir) + self.compile(['gensym'], cwd=Pdir) + self.compile(['madevent_forhel'], cwd=Pdir) + ############################################################################ def do_syscalc(self, line): """Evaluate systematics variation weights for a given run""" @@ -6132,7 +6184,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in 
Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file.' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6896,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6906,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7023,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... 
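Note on the gridpack refinement changes above (gen_ximprove_gridpack and the new nprocs/maxevts arguments of GridPackCmd): channels are now split into several jobs when they need more than max_request_event events. The splitting arithmetic, as an illustrative standalone function with made-up numbers (not the actual get_job_for_event code):

def number_of_splits(needed_event, max_request_event, max_splitting, split_channels=True):
    # ceil(needed_event / max_request_event), clamped to the range [1, max_splitting]
    nb_split = int(max(1, (needed_event - 1) // max_request_event + 1))
    if not split_channels:
        nb_split = 1
    return max(1, min(nb_split, max_splitting))

assert number_of_splits(10000, 2500, 100) == 4   # four jobs of ~2500 requested events each
assert number_of_splits(1200, 2500, 100) == 1    # below the per-job maximum: no split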
self.exec_cmd('combine_events') @@ -6902,6 +7051,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7066,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7078,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7203,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7223,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/restore_data b/epochX/cudacpp/gg_ttg.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/restore_data +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . 
-mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/sum_html.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s
' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/gg_ttg.mad/bin/madevent b/epochX/cudacpp/gg_ttg.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/madevent +++ b/epochX/cudacpp/gg_ttg.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h index ff9f0d7f00..a18c3a4ea2 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc index 47a3a011b8..a5e188e4f8 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h index 76066c7bb1..24e0e80f84 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h index 7c6a082392..be5c5a6357 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
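With the NSIGHT hooks commented out above, the mgDebug* macros are now unconditional no-ops, so instrumented call sites keep compiling but generate no code. A small sketch, assuming only the no-op macros from mgOnGpuConfig.h (exampleSigmaKinBody is an illustrative name, not the generated kernel):

// Sketch: mgDebug* calls now expand to /*noop*/ in every build (CUDA, HIP and C++).
#include "mgOnGpuConfig.h"

void exampleSigmaKinBody()
{
  mgDebugDeclare();
  mgDebugInitialise();
  mgDebug( 0, __FUNCTION__ ); // start marker (arguments are currently unused)
  // ... matrix element computation would go here ...
  mgDebug( 1, __FUNCTION__ ); // end marker
  mgDebugFinalise();
}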
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index 882c93c3a5..9c2ae753b6 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +57,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006311178207397461  +DEBUG: model prefixing takes 0.0047342777252197266  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,33 +150,33 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.022 s +1 processes with 16 diagrams generated in 0.018 s Total: 1 processes with 16 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. 
-Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. +Generated helas calls for 1 subprocesses (16 diagrams) in 0.034 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.323 s +ALOHA: aloha creates 5 routines in 0.247 s VVV1 VVV1 FFV1 @@ -187,17 +186,17 @@ ALOHA: aloha creates 5 routines in 0.323 s VVVV1 VVVV3 VVVV4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. quit -real 0m0.791s -user 0m0.727s -sys 0m0.049s +real 0m0.729s +user 0m0.664s +sys 0m0.058s Code generation completed in 0 seconds diff --git a/epochX/cudacpp/gg_ttg.sa/COPYRIGHT b/epochX/cudacpp/gg_ttg.sa/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/gg_ttg.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_ttg.sa/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. 
- * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). 
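The reflowed Bridge documentation above pins down the two momenta layouts: Fortran AOS P_MULTI(0:3, NEXTERNAL, VECSIZE_USED), read from C as p_multi[nevtF][nparF][np4F], and the CUDA/C++ AOSOA momenta[npagM][npar][np4][neppM] with nevt = npagM*neppM. A standalone worked sketch of the F2C index arithmetic used by the transposition code later in this file (the npar, np4 and neppM values are illustrative only):

// Sketch: AOS (Fortran) -> AOSOA (cudacpp) index mapping for the Bridge momenta copy.
#include <cassert>

int main()
{
  constexpr int npar = 5;  // external particles (e.g. g g > t t~ g)
  constexpr int np4 = 4;   // E, px, py, pz
  constexpr int neppM = 8; // events per AOSOA "page"
  const int nevt = 32;     // must be a multiple of neppM
  assert( nevt % neppM == 0 );
  for( int ievt = 0; ievt < nevt; ievt++ )
    for( int ipar = 0; ipar < npar; ipar++ )
      for( int ip4 = 0; ip4 < np4; ip4++ )
      {
        const int ipagM = ievt / neppM;
        const int ieppM = ievt % neppM;
        const int fpos = ievt * npar * np4 + ipar * np4 + ip4;                                  // AOS position
        const int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; // AOSOA position
        // out[cpos] = in[fpos]; // this is the F2C copy performed by the Bridge transposition
        (void)fpos;
        (void)cpos;
      }
  return 0;
}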
+ */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
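The reformatted declarations keep the Bridge call pattern unchanged: construct with (nevtF, nparF, np4F), then hand gpu_sequence or cpu_sequence the Fortran-ordered host buffers documented above (both take the same argument list). A hedged host-side sketch for a C++ build (exampleBridgeCall and the buffer contents are illustrative; fbridge.cc is what actually drives this from MadEvent):

// Sketch: driving the Bridge from C++ with Fortran-ordered host buffers.
#include "Bridge.h"
#include <vector>

void exampleBridgeCall()
{
  using namespace mg5amcCpu;                // mg5amcGpu in GPU builds
  const unsigned int nevt = 32, npar = 5, np4 = 4;
  Bridge<double> bridge( nevt, npar, np4 ); // checks npar/np4 against CPPProcess, reads param_card.dat
  std::vector<double> momenta( nevt * npar * np4 ); // P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) layout
  std::vector<double> gs( nevt ), rndhel( nevt ), rndcol( nevt ), mes( nevt );
  std::vector<int> selhel( nevt ), selcol( nevt );
  std::vector<unsigned int> channelIds( nevt, 1 ); // Feynman diagram to enhance (or pass nullptr)
  bridge.cpu_sequence( momenta.data(), gs.data(), rndhel.data(), rndcol.data(),
                       channelIds.data(), mes.data(), selhel.data(), selcol.data() );
}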
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
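The constructor body above chooses the GPU grid from the number of events: it starts from 256 threads per block, requires nevt to be at least and a multiple of s_gputhreadsmin = 32, and halves gputhreads until gpublocks*gputhreads equals nevt. A standalone sketch of that selection with two worked values (chooseGpuGrid is an illustrative re-implementation, not the Bridge code itself):

// Sketch: the block/thread selection loop used in the Bridge constructor, with worked examples.
#include <iostream>
#include <stdexcept>
#include <string>

void chooseGpuGrid( int nevt, int& gpublocks, int& gputhreads )
{
  constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT)
  gputhreads = 256;                   // default number of gpu threads
  gpublocks = nevt / gputhreads;      // initial guess, refined below
  if( ( nevt < s_gputhreadsmin ) || ( nevt % s_gputhreadsmin != 0 ) )
    throw std::runtime_error( "nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) );
  while( nevt != gpublocks * gputhreads )
  {
    gputhreads /= 2;
    if( gputhreads < s_gputhreadsmin ) throw std::logic_error( "cannot choose gputhreads" );
    gpublocks = nevt / gputhreads;
  }
}

int main()
{
  int blocks, threads;
  chooseGpuGrid( 8192, blocks, threads );
  std::cout << "8192 events -> " << blocks << " blocks x " << threads << " threads" << std::endl; // 32 x 256
  chooseGpuGrid( 96, blocks, threads );
  std::cout << "  96 events -> " << blocks << " blocks x " << threads << " threads" << std::endl; //  3 x 32
  return 0;
}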
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
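The GpuAbstraction.h additions above expose one gpuBlas*/gpuStream* vocabulary that resolves to cuBLAS under CUDA and hipBLAS under HIP, with the gpuBlasT* aliases picking the single- or double-precision entry points according to MGONGPU_FPTYPE2_FLOAT, while GpuRuntime.h adds the matching checkGpuBlas error check. A hedged sketch of one GEMM written against this layer (exampleColorGemm, the matrix shapes and the per-call handle are illustrative only, not the generated color-sum code):

// Sketch: a single GEMM through the gpuBlas* abstraction; resolves to cublasDgemm/Sgemm
// under CUDA and hipblasDgemm/Sgemm under HIP, depending on MGONGPU_FPTYPE2_FLOAT.
#include "mgOnGpuConfig.h"
#include "GpuAbstraction.h"
#include "GpuRuntime.h"

#if defined MGONGPUCPP_GPUIMPL && !defined MGONGPU_HAS_NO_BLAS
using mgOnGpu::fptype2;
void exampleColorGemm( const fptype2* devA, const fptype2* devB, fptype2* devC, int ncolor, int nevt )
{
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) );
  const fptype2 alpha = 1, beta = 0;
  // Column-major C[ncolor x nevt] = A[ncolor x ncolor] * B[ncolor x nevt], all in device memory
  checkGpuBlas( gpuBlasTgemm( handle, GPUBLAS_OP_N, GPUBLAS_OP_N,
                              ncolor, nevt, ncolor,
                              &alpha, devA, ncolor, devB, ncolor,
                              &beta, devC, ncolor ) );
  checkGpuBlas( gpuBlasDestroy( handle ) );
}
#endif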
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
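The device buffers allocated in computeGoodHelicities above are "super-buffers": the ME, numerator and denominator buffers hold nGoodHel contiguous slices of nevt values each (the jamp super-buffer adds colour and real/imaginary dimensions on top), and the reworked sigmaKin later addresses a slice for good-helicity index ighel as base + ighel*nevt. A minimal sketch of that addressing convention, with a hypothetical kernel name (sumOverGoodHelicities is not part of this patch) and the usual one-thread-per-event mapping:

  __global__ void
  sumOverGoodHelicities( const fptype* ghelAllMEs, // input: [nGoodHel][nevt] ME super-buffer (slice ighel starts at ighel*nevt)
                         fptype* allMEs,           // output: allMEs[nevt], sum over good helicities
                         const int nGoodHel,
                         const int nevt )
  {
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
    if( ievt >= nevt ) return;
    fptype sum = 0;
    for( int ighel = 0; ighel < nGoodHel; ighel++ )
      sum += ghelAllMEs[ighel * nevt + ievt]; // slice ighel, event ievt
    allMEs[ievt] = sum;
  }

In the actual patch this reduction is fused with the helicity selection in add_and_select_hel, which reuses the super-buffer to store the running sums needed for that selection.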
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h index 65a101888d..2fa0ce29e0 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace 
mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer 
DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc index bf77ac9970..896544668f 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
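The DeviceAccessJamp2 helper introduced above encodes a simple SoA convention, buffer[icol*nevt + ievt]: for a fixed colour index the event index is the fastest-varying one, so consecutive threads of a warp touch consecutive addresses (coalesced access). A minimal sketch of a kernel using the same indexing (the kernel name and the per-helicity contrib buffer are illustrative only, not part of this patch); as the accumulation just below notes, atomics are needed because kernels in different helicity streams may update the same jamp2 buffer concurrently:

  __global__ void
  accumulateJamp2( fptype* colAllJamp2s,   // in/out: running sums jamp2[ncolor][nevt] in SoA layout
                   const fptype* contrib ) // input: contribution of one helicity, same [ncolor][nevt] layout
  {
    const int nevt = gridDim.x * blockDim.x;
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
    for( int icol = 0; icol < CPPProcess::ncolor; icol++ )
    {
      // Same indexing as DeviceAccessJamp2::kernelAccessIcol: fixed icol, consecutive ievt => coalesced
      // atomicAdd protects against concurrent updates from kernels running in other helicity streams
      atomicAdd( &colAllJamp2s[icol * nevt + ievt], contrib[icol * nevt + ievt] );
    }
  }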
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -561,158 +617,43 @@ namespace mg5amcCpu jamp_sv[5] -= amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttxg()?) 
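For reference, the colour-matrix block removed below computed, per event and per helicity,

  |M|^2 += sum_{i,j} cf[i][j] * ( Re(jamp_i)*Re(jamp_j) + Im(jamp_i)*Im(jamp_j) ) / denom[i]
         = sum_{i,j} Re( conj(jamp_i) * jamp_j ) * cf[i][j] / denom[i]

i.e. a real quadratic form in the ncolor partial amplitudes, with cf a real symmetric matrix and denom a per-row normalization (here denom[i] = 9 for all i). This is the reduction that is now performed outside calculate_jamps by the new color_sum_gpu / color_sum_cpu functions.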
- - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 9, 9, 9, 9, 9, 9 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 64, -8, -8, 1, 1, 10 }, - { -8, 64, 1, 10, -8, 1 }, - { -8, 1, 64, -8, 10, 1 }, - { 1, 10, -8, 64, 1, -8 }, - { 1, -8, 10, 1, 64, -8 }, - { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
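The per-event quadratic form above lends itself to a BLAS formulation across all events of a given helicity, which is what the new cuBLAS/hipBLAS path enables. The following is only a sketch of the idea under stated assumptions (double precision, host pointer mode; handle, jampR, colorMatrixNorm and tmp are placeholder names), not necessarily what color_sum.cc implements: with the real parts of the jamps stored as jampR[icol*nevt + ievt], the buffer can be read as an nevt-by-ncolor column-major matrix J (lda = nevt); with Cn[i][j] = cf[i][j]/denom[i], a single GEMM computes T = J*Cn, after which ME[ievt] += sum_i J(ievt,i)*T(ievt,i), and the same is repeated for the imaginary parts.

  const double one = 1, zero = 0;
  checkGpuBlas( gpuBlasDgemm( handle, GPUBLAS_OP_N, GPUBLAS_OP_N,
                              nevt, ncolor, ncolor,      // m, n, k
                              &one,
                              jampR, nevt,               // J  (nevt x ncolor, column-major, lda = nevt)
                              colorMatrixNorm, ncolor,   // Cn (ncolor x ncolor, ldb = ncolor)
                              &zero,
                              tmp, nevt ) );             // T  (nevt x ncolor, ldc = nevt)
  // ...followed by a small per-event kernel (or dot products) accumulating sum_i J(ievt,i)*T(ievt,i) into ME[ievt]

Batching over events turns many ncolor-sized per-event loops into a few large GEMMs; this is also why the CUDACPP_RUNTIME_CUBLASTF32TENSOR option earlier in this patch is only honoured when the colour algebra runs in single precision (MGONGPU_FPTYPE2_FLOAT).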
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -768,7 +709,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -802,6 +747,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -843,6 +792,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -963,8 +916,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -972,25 +925,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1135,13 +1266,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1153,18 +1278,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1189,93 +1319,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1317,7 +1384,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1340,7 +1407,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1349,21 +1416,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1377,8 +1446,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1394,11 +1465,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1500,14 +1572,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h index 2acfa000a7..69d8ea8b08 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 16; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running 
sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/color_sum.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/color_sum.cc new file mode 100644 index 0000000000..9e3ce9d917 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/color_sum.cc @@ -0,0 +1,431 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
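The new color_sum.h/color_sum.cc pair introduced below factors the leading-colour sum out of the old calculate_wavefunctions: sigmaKin now obtains the colour amplitudes (jamps) from calculate_jamps and delegates the sum to color_sum_cpu, color_sum_kernel or color_sum_blas. As a sketch in the notation of the code below (jamps J_i, colour-matrix numerators C_ij, row denominators d_i), the quantity each routine adds to the running |M|^2 for one helicity is

\[
\Delta|M|^2 \;=\; \sum_{i=1}^{n_{\mathrm{color}}} \frac{1}{d_i}\,\mathrm{Re}\!\left( J_i^{*} \sum_{j=1}^{n_{\mathrm{color}}} C_{ij}\, J_j \right),
\]

and, writing J = A + iB with M_{ij} = C_{ij}/d_i real and symmetric (so the imaginary cross terms cancel),

\[
\Delta|M|^2 \;=\; A^{T} M A + B^{T} M B \;=\; \sum_i M_{ii}\,(A_i^2 + B_i^2) \;+\; 2 \sum_{i<j} M_{ij}\,(A_i A_j + B_i B_j),
\]

which is the triangular form precomputed at compile time in TriangularNormalizedColorMatrix (CPU path) and used in the "ihel3p1" loop of color_sum_kernel (GPU path). For the GPU BLAS path, the jamp super-buffer is laid out as two real blocks of shape [ncolor][nGoodHel][nevt] (all real parts first, then all imaginary parts), so that for fixed (icol, ihel) the events are contiguous; color_sum_blas can then perform the whole sum as a GEMM per real/imaginary component (Ztemp = NormColMat x Jamps^T) followed by strided-batched 1x1 GEMMs, i.e. per-event dot products accumulated into the per-helicity MEs with beta = 1. A minimal index helper illustrating only the striding used by DeviceAccessJamp (the helper itself is hypothetical, not plugin code):

#include <cstddef>
// Flat offsets into the [2][ncolor][nGoodHel][nevt] jamp super-buffer (illustrative only)
inline std::size_t jampRealIndex( int icol, int ihel, int ievt, int ncolor, int nhel, int nevt )
{
  return ( (std::size_t)( 0 * ncolor + icol ) * nhel + ihel ) * nevt + ievt; // real block
}
inline std::size_t jampImagIndex( int icol, int ihel, int ievt, int ncolor, int nhel, int nevt )
{
  return ( (std::size_t)( 1 * ncolor + icol ) * nhel + ihel ) * nevt + ievt; // imaginary block
}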
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 9, 9, 9, 9, 9, 9 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 64, -8, -8, 1, 1, 10 }, + { -8, 64, 1, 10, -8, 1 }, + { -8, 1, 64, -8, 10, 1 }, + { 1, 10, -8, 64, 1, -8 }, + { 1, -8, 10, 1, 64, -8 }, + { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. 
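    // (Spelled out: with jamp = A + i*B and a real matrix M, the quadratic form expands as
    //  (A - i*B)^T M (A + i*B) = A^T M A + B^T M B + i*( A^T M B - B^T M A );
    //  the imaginary cross terms cancel because M is symmetric, leaving A^T M A + B^T M B.)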
+ // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int 
icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/color_sum.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ 
b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/fbridge.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/makefile_original.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/color_sum.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // 
"old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
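# For example (illustrative: assuming the P* directory contains auto_dsig.f and auto_dsig1.f),
#   DSIG         would expand to: driver.o auto_dsig1.o
#   DSIG_cudacpp would expand to: driver_cudacpp.o auto_dsig1_cudacpp.o
# i.e. the wildcard picks up every auto_dsig*.f except the top-level auto_dsig.f, which is listed separately in the link rules.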
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. 
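The extern "C" declarations below are the Fortran-facing bridge API; the following is only a rough sketch of the calling sequence they imply, with all buffer names, sizes and values being illustrative assumptions rather than plugin code:

// Hedged sketch: driving the bridge declared below from C++ (in production it is called from Fortran via madevent)
#include <vector>
#include "fbridge.h" // brings in CppObjectInFortran and the extern "C" declarations

void exampleBridgeSequence( const int nevt, const int npar )
{
  const int np4 = 4;                                  // #momentum components per particle
  CppObjectInFortran* bridge = nullptr;
  fbridgecreate_( &bridge, &nevt, &npar, &np4 );      // create the bridge for nevt events
  std::vector<double> momenta( nevt * npar * np4 );   // input: external momenta
  std::vector<double> gs( nevt );                     // input: strong coupling per event
  std::vector<double> rndhel( nevt ), rndcol( nevt ); // input: random numbers for helicity/colour selection
  std::vector<unsigned int> channelIds( nevt, 1 );    // input: single-diagram enhancement channel ids
  std::vector<double> mes( nevt );                    // output: matrix elements
  std::vector<int> selhel( nevt ), selcol( nevt );    // output: selected helicities and colours
  const bool goodHelOnly = false;                     // input: good-helicity-only mode flag
  fbridgesequence_( &bridge, momenta.data(), gs.data(), rndhel.data(), rndcol.data(),
                    channelIds.data(), mes.data(), selhel.data(), selcol.data(), &goodHelOnly );
  fbridgedelete_( &bridge );                          // release the bridge
}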
+ +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h index ff9f0d7f00..a18c3a4ea2 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc index 47a3a011b8..a5e188e4f8 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h index 76066c7bb1..24e0e80f84 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h index d3c4ca5695..7d34de72f8 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
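[Editor's aside, a minimal sketch, not part of the patch] The MGONGPU_HAS_NO_BLAS switch introduced in mgOnGpuConfig.h above is consumed by guarding each cuBLAS/hipBLAS call site and falling back to a BLAS-free path otherwise; the gpuBlas* aliases and the checkGpuBlas macro appear in the GpuAbstraction.h and GpuRuntime.h hunks later in this diff. The helper below is hypothetical and only illustrates the guard pattern (y := a*x + y on device buffers via the double-precision axpy).

// Editor's sketch (hypothetical helper, illustrating the MGONGPU_HAS_NO_BLAS guard)
#include "mgOnGpuConfig.h"  // MGONGPU_HAS_NO_BLAS (always defined in the C++-only backend)
#include "GpuAbstraction.h" // gpuBlasHandle_t, gpuBlasCreate, gpuBlasDaxpy, ... (GPU builds)
#include "GpuRuntime.h"     // checkGpuBlas (GPU builds with BLAS)
inline void deviceAxpySketch( int n, double a, const double* d_x, double* d_y )
{
#ifndef MGONGPU_HAS_NO_BLAS
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) );
  checkGpuBlas( gpuBlasDaxpy( handle, n, &a, d_x, 1, d_y, 1 ) ); // y := a*x + y
  checkGpuBlas( gpuBlasDestroy( handle ) );
#else
  ( void )n; ( void )a; ( void )d_x; ( void )d_y; // no BLAS available in this build
#endif
}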
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 78cdfd68b2..6e14919193 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +57,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006547212600708008  +DEBUG: model prefixing takes 0.008086442947387695  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,21 +150,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.171 s +1 processes with 123 diagrams generated in 0.250 s Total: 1 processes with 123 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_gg_ttgg INFO: remove old information in CODEGEN_mad_gg_ttgg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 @@ -177,25 +176,25 @@ FileWriter t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 
17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1552]  -Generated helas calls for 1 subprocesses (123 diagrams) in 0.423 s -Wrote files for 222 helas calls in 0.660 s +DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1577]  +Generated helas calls for 1 subprocesses (123 diagrams) in 0.835 s +Wrote files for 222 helas calls in 1.049 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.328 s +ALOHA: aloha creates 5 routines in 0.627 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.313 s +ALOHA: aloha creates 10 routines in 0.561 s VVV1 VVV1 FFV1 @@ -208,38 +207,32 
@@ ALOHA: aloha creates 10 routines in 0.313 s VVVV3 VVVV4 VVVV4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses/P1_gg_ttxgg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #2 succeeded at 275 (offset 48 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. +DEBUG: result.returncode =  0 [output.py at line 273]  +Output to directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README +/home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README Run "open index.html" to see more information about this process. 
quit -real 0m4.934s -user 0m3.516s -sys 0m0.277s -Code generation completed in 5 seconds +real 0m6.937s +user 0m6.044s +sys 0m0.744s +Code generation completed in 7 seconds ************************************************************ * * * W E L C O M E to * @@ -252,7 +245,7 @@ Code generation completed in 5 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -260,10 +253,9 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -282,7 +274,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -290,10 +282,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_ttgg.mad/COPYRIGHT b/epochX/cudacpp/gg_ttgg.mad/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/gg_ttgg.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_ttgg.mad/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). 
All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt index 68b4c46295..311ceaa803 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat index 1fa5e235b3..f27fba5d1a 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.5 2025-10-17 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat index ecdc7fd25c..964b954d74 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat index 7ec841d6c2..308f5bed4f 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_ttgg.mad/MGMEVersion.txt b/epochX/cudacpp/gg_ttgg.mad/MGMEVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/gg_ttgg.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_ttgg.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/.make_opts b/epochX/cudacpp/gg_ttgg.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/.make_opts +++ b/epochX/cudacpp/gg_ttgg.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/alfas_functions.f b/epochX/cudacpp/gg_ttgg.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/gg_ttgg.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/cuts.inc b/epochX/cudacpp/gg_ttgg.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/cuts.inc +++ b/epochX/cudacpp/gg_ttgg.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/make_opts b/epochX/cudacpp/gg_ttgg.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/make_opts +++ b/epochX/cudacpp/gg_ttgg.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/makefile b/epochX/cudacpp/gg_ttgg.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/makefile +++ b/epochX/cudacpp/gg_ttgg.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/run_card.inc b/epochX/cudacpp/gg_ttgg.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/run_card.inc +++ b/epochX/cudacpp/gg_ttgg.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. 
In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). + */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
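          // [Editor's illustration, not part of the patch] Worked example of the AOS<->AOSOA
          // index mapping computed in the loop below, assuming for illustration
          // npar = 4, np4 = 4, neppM = 4 and event ievt = 5 (so ipagM = 1, ieppM = 1):
          //   Fortran-style AOS:  fpos = ievt*npar*np4 + ipar*np4 + ip4 = 80 + 4*ipar + ip4
          //   cudacpp AOSOA:      cpos = ipagM*npar*np4*neppM + ipar*np4*neppM + ip4*neppM + ieppM
          //                            = 64 + 16*ipar + 4*ip4 + 1
          // i.e. the same momentum component of neppM consecutive events ends up contiguous in
          // memory, which is what the SIMD/GPU kernels read.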
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MGVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
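The checkGpuBlas/assertGpuBlas helpers added to GpuRuntime.h above mirror the existing checkGpu/assertGpu pattern, but for cuBLAS/hipBLAS status codes instead of runtime error codes. A minimal usage sketch follows (hypothetical helper name, assuming a GPU build where MGONGPU_HAS_NO_BLAS is not defined); note that the new gpuStreamCreate/gpuStreamDestroy macros already expand to checkGpu-wrapped calls, so only the gpuBlas* calls need the explicit status check.

#include "GpuAbstraction.h"
#include "GpuRuntime.h"

void exampleBlasSetup() // hypothetical helper, not part of the plugin
{
  gpuStream_t stream;
  gpuStreamCreate( &stream ); // already expands to checkGpu( cudaStreamCreate/hipStreamCreate( ... ) )
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) );           // asserts unless the status is GPUBLAS_STATUS_SUCCESS
  checkGpuBlas( gpuBlasSetStream( handle, stream ) ); // attach the BLAS handle to the stream
  // ... enqueue gpuBlasTgemm / gpuBlasTgemmStridedBatched work on 'stream' here ...
  checkGpuBlas( gpuBlasDestroy( handle ) );
  gpuStreamDestroy( stream ); // also wrapped in checkGpu by the macro
}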
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
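The per-helicity "super-buffers" allocated in computeGoodHelicities above (m_pHelMEs, m_pHelJamps, m_pHelNumerators, m_pHelDenominators) pack one nevt-sized slice per good helicity, so element (ighel, ievt) lives at offset ighel * nevt + ievt; this is the indexing used later by select_hel when it sums the per-helicity MEs. A minimal sketch of that layout is shown below (hypothetical kernel name, assuming fptype comes from the plugin's mgOnGpuConfig.h).

#include "mgOnGpuConfig.h"

__global__ void sumOverGoodHelicities( const fptype* ghelAllMEs, // input: super-buffer [nGoodHel * nevt]
                                       fptype* allMEs,           // output: summed MEs [nevt]
                                       const int nGoodHel,
                                       const int nevt )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per thread
  if( ievt >= nevt ) return;
  fptype me = 0;
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
    me += ghelAllMEs[ighel * nevt + ievt]; // slice ighel, event ievt
  allMEs[ievt] = me;
}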
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h index 65a101888d..2fa0ce29e0 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ 
namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer 
DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc index c508e73f26..4272326385 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 24; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities -#endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId 
= 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) +#endif + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using 
E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
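The DeviceAccessJamp2 helper introduced above stores the per-color jamp2 sums with the event index fastest, i.e. buffer[icol * nevt + ievt], so for a fixed color the threads of a warp (one event per thread) touch contiguous addresses. The illustrative kernel below uses the same layout (hypothetical name, and separate real/imaginary input arrays are an assumption of the sketch, not the plugin's DeviceAccessJamp layout); atomicAdd is used because, as in calculate_jamps, several helicity streams may update the same buffer concurrently.

#include "mgOnGpuConfig.h"

__global__ void accumulateJamp2( fptype* allJamp2s,    // in/out: [ncolor * nevt], running sum over colors/helicities
                                 const fptype* jampRe, // input: [ncolor * nevt] real parts for one helicity (sketch layout)
                                 const fptype* jampIm, // input: [ncolor * nevt] imaginary parts for one helicity (sketch layout)
                                 const int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;                // as in DeviceAccessJamp2::kernelAccessIcol
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per thread
  for( int icol = 0; icol < ncolor; icol++ )
  {
    const fptype re = jampRe[icol * nevt + ievt];
    const fptype im = jampIm[icol * nevt + ievt];
    atomicAdd( &allJamp2s[icol * nevt + ievt], re * re + im * im ); // |jamp|^2 added to the per-color running sum
  }
}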
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -2461,176 +2517,43 @@ namespace mg5amcCpu jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttxgg()?) 
- - // The color denominators (initialize all array elements, with ncolor=24) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54 }; // 1-D array[24] - - // The color matrix (initialize all array elements, with ncolor=24) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 512, -64, -64, 8, 8, 80, -64, 8, 8, -1, -1, -10, 8, -1, 80, -10, 71, 62, -1, -10, -10, 62, 62, -28 }, - { -64, 512, 8, 80, -64, 8, 8, -64, -1, -10, 8, -1, -1, -10, -10, 62, 62, -28, 8, -1, 80, -10, 71, 62 }, - { -64, 8, 512, -64, 80, 8, 8, -1, 80, -10, 71, 62, -64, 8, 8, -1, -1, -10, -10, -1, 62, -28, -10, 62 }, - { 8, 80, -64, 512, 8, -64, -1, -10, -10, 62, 62, -28, 8, -64, -1, -10, 8, -1, -1, 8, 71, 62, 80, -10 }, - { 8, -64, 80, 8, 512, -64, -1, 8, 71, 62, 80, -10, -10, -1, 62, -28, -10, 62, -64, 8, 8, -1, -1, -10 }, - { 80, 8, 8, -64, -64, 512, -10, -1, 62, -28, -10, 62, -1, 8, 71, 62, 80, -10, 8, -64, -1, -10, 8, -1 }, - { -64, 8, 8, -1, -1, -10, 512, -64, -64, 8, 8, 80, 80, -10, 8, -1, 62, 71, -10, 62, -1, -10, -28, 62 }, - { 8, -64, -1, -10, 8, -1, -64, 512, 8, 80, -64, 8, -10, 62, -1, -10, -28, 62, 80, -10, 8, -1, 62, 71 }, - { 8, -1, 80, -10, 71, 62, -64, 8, 512, -64, 80, 8, 8, -1, -64, 8, -10, -1, 62, -28, -10, -1, 62, -10 }, - { -1, -10, -10, 62, 62, -28, 8, 80, -64, 512, 8, -64, -1, -10, 8, -64, -1, 8, 71, 62, -1, 8, -10, 80 }, - { -1, 8, 71, 62, 80, -10, 8, -64, 80, 8, 512, -64, 62, -28, -10, -1, 62, -10, 8, -1, -64, 8, -10, -1 }, - { -10, -1, 62, -28, -10, 62, 80, 8, 8, -64, -64, 512, 71, 62, -1, 8, -10, 80, -1, -10, 8, -64, -1, 8 }, - { 8, -1, -64, 8, -10, -1, 80, -10, 8, -1, 62, 71, 512, -64, -64, 8, 8, 80, 62, -10, -28, 62, -1, -10 }, - { -1, -10, 8, -64, -1, 8, -10, 62, -1, -10, -28, 62, -64, 512, 8, 80, -64, 8, -10, 80, 62, 71, 8, -1 }, - { 80, -10, 8, -1, 62, 71, 8, -1, -64, 8, -10, -1, -64, 8, 512, -64, 80, 8, -28, 62, 62, -10, -10, -1 }, - { -10, 62, -1, -10, -28, 62, -1, -10, 8, -64, -1, 8, 8, 80, -64, 512, 8, -64, 62, 71, -10, 80, -1, 8 }, - { 71, 62, -1, 8, -10, 80, 62, -28, -10, -1, 62, -10, 8, -64, 80, 8, 512, -64, -1, 8, -10, -1, -64, 8 }, - { 62, -28, -10, -1, 62, -10, 71, 62, -1, 8, -10, 80, 80, 8, 8, -64, -64, 512, -10, -1, -1, 8, 8, -64 }, - { -1, 8, -10, -1, -64, 8, -10, 80, 62, 71, 8, -1, 62, -10, -28, 62, -1, -10, 512, -64, -64, 8, 8, 80 }, - { -10, -1, -1, 8, 8, -64, 62, -10, -28, 62, -1, -10, -10, 80, 62, 71, 8, -1, -64, 512, 8, 80, -64, 8 }, - { -10, 80, 62, 71, 8, -1, -1, 8, -10, -1, -64, 8, -28, 62, 62, -10, -10, -1, -64, 8, 512, -64, 80, 8 }, - { 62, -10, -28, 62, -1, -10, -10, -1, -1, 8, 8, -64, 62, 71, -10, 80, -1, 8, 8, 80, -64, 512, 8, -64 }, - { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, - { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - 
value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -2718,7 +2641,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } 
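The block removed above folded the color matrix sum into calculate_wavefunctions; in the new scheme calculate_jamps only stores the per-helicity jamps, and the quadratic form with the real, symmetric color matrix is evaluated afterwards in a separate color_sum step, optionally through cuBLAS/hipBLAS when CUDACPP_RUNTIME_BLASCOLORSUM is set. For reference, a host-side sketch of that quadratic form, written exactly as the removed CUDA branch computed it (illustrative standalone function, not the plugin's color_sum implementation):

#include <complex>
#include <vector>

double colorSumReference( const std::vector<std::complex<double>>& jamp, // [ncolor] partial amplitudes for one event and helicity
                          const std::vector<std::vector<double>>& cf,    // [ncolor][ncolor] color matrix
                          const std::vector<double>& denom )             // [ncolor] color denominators
{
  const size_t ncolor = jamp.size();
  double me2 = 0;
  for( size_t icol = 0; icol < ncolor; icol++ )
  {
    double ztempR = 0, ztempI = 0;
    for( size_t jcol = 0; jcol < ncolor; jcol++ )
    {
      ztempR += cf[icol][jcol] * jamp[jcol].real(); // cf is real: the quadratic form splits into Re and Im parts
      ztempI += cf[icol][jcol] * jamp[jcol].imag();
    }
    me2 += ( ztempR * jamp[icol].real() + ztempI * jamp[icol].imag() ) / denom[icol];
  }
  return me2;
}

Stacking Re(J) and Im(J) for all events of a given helicity as columns turns the inner loops into two real matrix products, which is the kind of operation the new gpuBlasTgemm / gpuBlasTgemmStridedBatched aliases are there to map onto.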
//-------------------------------------------------------------------------- @@ -2753,6 +2680,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -2795,6 +2726,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -2915,8 +2850,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -2924,25 +2859,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr 
to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -3087,13 +3200,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 512 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -3105,18 +3212,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -3141,93 +3253,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -3269,7 +3318,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -3292,7 +3341,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -3301,21 +3350,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -3329,8 +3380,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -3346,11 +3399,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -3452,14 +3506,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h index 2b75e0f842..05c6aedfb3 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 123; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 24; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running 
sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f index c087f3f747..347686d1e9 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f index ce5493be9b..7e58e4577f 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -442,51 +446,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. 
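As an aside on the selection logic introduced above: the new GPU kernels add_and_select_hel and select_col, and the corresponding C++ loops in sigmaKin, all perform the same event-by-event draw — build a running (cumulative) sum of per-candidate weights (the per-helicity MEs in ghelAllMEs, or the per-color targetamp built from jamp2), then pick the first candidate whose cumulative fraction exceeds the uniform random number allrndhel[ievt] or allrndcol[ievt]. The following is a minimal standalone sketch of that draw only, not code taken from the patch; the helper name cumulativeDraw and the sample weights are invented for illustration.

// Illustrative sketch of the cumulative-sum draw used for helicity (#403) and color (#402) selection
#include <cassert>
#include <cstdio>
#include <vector>

// Return a 1-based index (Fortran convention, as for allselhel/allselcol) drawn with
// probability proportional to 'weights', given a uniform random number 'rnd' in [0,1)
static int cumulativeDraw( const std::vector<double>& weights, double rnd )
{
  double sum = 0;
  for( double w : weights ) sum += w;
  assert( sum > 0 );
  double running = 0;
  for( size_t i = 0; i < weights.size(); i++ )
  {
    running += weights[i]; // running sum, as in ghelAllMEs / targetamp above
    if( rnd < running / sum ) return (int)i + 1; // NB Fortran [1,n], cudacpp [0,n-1]
  }
  return (int)weights.size(); // safety net for rnd close to 1 and rounding effects
}

int main()
{
  const std::vector<double> mesPerGoodHel = { 0.1, 0.4, 0.5 }; // e.g. |M|^2 per good helicity (made-up values)
  printf( "selected helicity (1-based): %d\n", cumulativeDraw( mesPerGoodHel, 0.45 ) ); // prints 2
  return 0;
}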
- IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/color_sum.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/color_sum.cc new file mode 100644 index 0000000000..91a7f9998e --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/color_sum.cc @@ -0,0 +1,449 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=24) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54 }; // 1-D array[24] + + // The color matrix (initialize all array elements, with ncolor=24) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 512, -64, -64, 8, 8, 80, -64, 8, 8, -1, -1, -10, 8, -1, 80, -10, 71, 62, -1, -10, -10, 62, 62, -28 }, + { -64, 512, 8, 80, -64, 8, 8, -64, -1, -10, 8, -1, -1, -10, -10, 62, 62, -28, 8, -1, 80, -10, 71, 62 }, + { -64, 8, 512, -64, 80, 8, 8, -1, 80, -10, 71, 62, -64, 8, 8, -1, -1, -10, -10, -1, 62, -28, -10, 62 }, + { 8, 80, -64, 512, 8, -64, -1, -10, -10, 62, 62, -28, 8, -64, -1, -10, 8, -1, -1, 8, 71, 62, 80, -10 }, + { 8, -64, 80, 8, 512, -64, -1, 8, 71, 62, 80, -10, -10, -1, 62, -28, -10, 62, -64, 8, 8, -1, -1, -10 }, + { 80, 8, 8, -64, -64, 512, -10, -1, 62, -28, -10, 62, -1, 8, 71, 62, 80, -10, 8, -64, -1, -10, 8, -1 }, + { -64, 8, 8, -1, -1, -10, 512, -64, -64, 8, 8, 80, 80, -10, 8, -1, 62, 71, -10, 62, -1, -10, -28, 62 }, + { 8, -64, -1, -10, 8, -1, -64, 512, 8, 80, -64, 8, -10, 62, -1, -10, -28, 62, 80, -10, 8, -1, 62, 71 }, + { 8, -1, 80, -10, 71, 62, -64, 8, 512, -64, 80, 8, 8, -1, -64, 8, -10, -1, 62, -28, -10, -1, 62, -10 }, + { -1, -10, -10, 62, 62, -28, 8, 80, -64, 512, 8, -64, -1, -10, 8, -64, -1, 8, 71, 62, -1, 8, -10, 80 }, + { -1, 8, 71, 62, 80, -10, 8, -64, 80, 8, 512, -64, 62, -28, -10, -1, 62, -10, 8, -1, -64, 8, -10, -1 }, + { -10, -1, 62, -28, -10, 62, 80, 8, 8, -64, -64, 512, 71, 62, -1, 8, -10, 80, -1, -10, 8, -64, -1, 8 }, + { 8, -1, -64, 8, -10, -1, 80, 
-10, 8, -1, 62, 71, 512, -64, -64, 8, 8, 80, 62, -10, -28, 62, -1, -10 }, + { -1, -10, 8, -64, -1, 8, -10, 62, -1, -10, -28, 62, -64, 512, 8, 80, -64, 8, -10, 80, 62, 71, 8, -1 }, + { 80, -10, 8, -1, 62, 71, 8, -1, -64, 8, -10, -1, -64, 8, 512, -64, 80, 8, -28, 62, 62, -10, -10, -1 }, + { -10, 62, -1, -10, -28, 62, -1, -10, 8, -64, -1, 8, 8, 80, -64, 512, 8, -64, 62, 71, -10, 80, -1, 8 }, + { 71, 62, -1, 8, -10, 80, 62, -28, -10, -1, 62, -10, 8, -64, 80, 8, 512, -64, -1, 8, -10, -1, -64, 8 }, + { 62, -28, -10, -1, 62, -10, 71, 62, -1, 8, -10, 80, 80, 8, 8, -64, -64, 512, -10, -1, -1, 8, 8, -64 }, + { -1, 8, -10, -1, -64, 8, -10, 80, 62, 71, 8, -1, 62, -10, -28, 62, -1, -10, 512, -64, -64, 8, 8, 80 }, + { -10, -1, -1, 8, 8, -64, 62, -10, -28, 62, -1, -10, -10, 80, 62, 71, 8, -1, -64, 512, 8, 80, -64, 8 }, + { -10, 80, 62, 71, 8, -1, -1, 8, -10, -1, -64, 8, -28, 62, 62, -10, -10, -1, -64, 8, 512, -64, 80, 8 }, + { 62, -10, -28, 62, -1, -10, -10, -1, -1, 8, 8, -64, 62, 71, -10, 80, -1, 8, 8, 80, -64, 512, 8, -64 }, + { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, + { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the 
property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps 
already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/color_sum.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/configs.inc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/configs.inc index b50d3d5335..570419b5c0 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/configs.inc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/configs.inc @@ 
-1530,3 +1530,5 @@ C Diagram 105 DATA (SPROP(I,-4,105),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/105/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/driver.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/driver.f index f7f23196eb..5997e65826 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/driver.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/fbridge.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/makefile_original.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f index 3ea53d8b21..5a966d34d4 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -71,10 +71,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! 
-1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -275,17 +272,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -355,7 +341,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -398,7 +384,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(155) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -441,407 +428,81 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /9.481481481481481D+00, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01,1.481481481481481D - $ +00/ - DATA (CF(I, 1),I= 7, 12) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 1),I= 13, 18) /1.481481481481481D-01, - $ -1.851851851851852D-02,1.481481481481481D+00, - $ -1.851851851851852D-01,1.314814814814815D+00,1.148148148148148D - $ +00/ - DATA (CF(I, 1),I= 19, 24) /-1.851851851851852D-02, - $ -1.851851851851852D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -5.185185185185185D-01/ + DATA DENOM/54/ + DATA (CF(I),I= 1, 24) /512,-128,-128,16,16,160,-128,16,16,-2,-2 + $ ,-20,16,-2,160,-20,142,124,-2,-20,-20,124,124,-56/ C 1 T(1,2,5,6,3,4) - DATA (CF(I, 2),I= 1, 6) /-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01,1.481481481481481D - $ +00,-1.185185185185185D+00,1.481481481481481D-01/ - DATA (CF(I, 2),I= 7, 12) /1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 2),I= 13, 18) /-1.851851851851852D-02, - $ -1.851851851851852D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -5.185185185185185D-01/ - DATA (CF(I, 2),I= 19, 24) /1.481481481481481D-01, - $ -1.851851851851852D-02,1.481481481481481D+00, - $ -1.851851851851852D-01,1.314814814814815D+00,1.148148148148148D - $ +00/ + DATA (CF(I),I= 25, 47) /512,16,160,-128,16,16,-128,-2,-20,16,-2, + $ -2,-20,-20,124,124,-56,16,-2,160,-20,142,124/ C 1 T(1,2,6,5,3,4) - DATA (CF(I, 3),I= 1, 6) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,9.481481481481481D+00, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01/ - DATA (CF(I, 3),I= 7, 12) /1.481481481481481D-01, - $ -1.851851851851852D-02,1.481481481481481D+00, - $ -1.851851851851852D-01,1.314814814814815D+00,1.148148148148148D - $ +00/ - DATA (CF(I, 3),I= 13, 18) /-1.185185185185185D+00 - $ 
,1.481481481481481D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 3),I= 19, 24) /-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00/ + DATA (CF(I),I= 48, 69) /512,-128,160,16,16,-2,160,-20,142,124, + $ -128,16,16,-2,-2,-20,-20,-2,124,-56,-20,124/ C 1 T(1,5,2,6,3,4) - DATA (CF(I, 4),I= 1, 6) /1.481481481481481D-01 - $ ,1.481481481481481D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 4),I= 7, 12) /-1.851851851851852D-02, - $ -1.851851851851852D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -5.185185185185185D-01/ - DATA (CF(I, 4),I= 13, 18) /1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 4),I= 19, 24) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.314814814814815D+00,1.148148148148148D - $ +00,1.481481481481481D+00,-1.851851851851852D-01/ + DATA (CF(I),I= 70, 90) /512,16,-128,-2,-20,-20,124,124,-56,16, + $ -128,-2,-20,16,-2,-2,16,142,124,160,-20/ C 1 T(1,5,6,2,3,4) - DATA (CF(I, 5),I= 1, 6) /1.481481481481481D-01, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01,9.481481481481481D+00,-1.185185185185185D+00/ - DATA (CF(I, 5),I= 7, 12) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.314814814814815D+00,1.148148148148148D - $ +00,1.481481481481481D+00,-1.851851851851852D-01/ - DATA (CF(I, 5),I= 13, 18) /-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 5),I= 19, 24) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02, - $ -1.851851851851852D-01/ + DATA (CF(I),I= 91,110) /512,-128,-2,16,142,124,160,-20,-20,-2 + $ ,124,-56,-20,124,-128,16,16,-2,-2,-20/ C 1 T(1,6,2,5,3,4) - DATA (CF(I, 6),I= 1, 6) /1.481481481481481D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00/ - DATA (CF(I, 6),I= 7, 12) /-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 6),I= 13, 18) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.314814814814815D+00,1.148148148148148D - $ +00,1.481481481481481D+00,-1.851851851851852D-01/ - DATA (CF(I, 6),I= 19, 24) /1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02/ + DATA (CF(I),I=111,129) /512,-20,-2,124,-56,-20,124,-2,16,142,124 + $ ,160,-20,16,-128,-2,-20,16,-2/ C 1 T(1,6,5,2,3,4) - DATA (CF(I, 7),I= 1, 6) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 7),I= 7, 12) /9.481481481481481D+00, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01,1.481481481481481D - $ +00/ - DATA (CF(I, 7),I= 13, 18) /1.481481481481481D+00, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,1.148148148148148D+00,1.314814814814815D - $ +00/ - DATA (CF(I, 7),I= 19, 24) /-1.851851851851852D-01 - $ 
,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00/ + DATA (CF(I),I=130,147) /512,-128,-128,16,16,160,160,-20,16,-2 + $ ,124,142,-20,124,-2,-20,-56,124/ C 1 T(2,1,5,6,3,4) - DATA (CF(I, 8),I= 1, 6) /1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 8),I= 7, 12) /-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01,1.481481481481481D - $ +00,-1.185185185185185D+00,1.481481481481481D-01/ - DATA (CF(I, 8),I= 13, 18) /-1.851851851851852D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 8),I= 19, 24) /1.481481481481481D+00, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,1.148148148148148D+00,1.314814814814815D - $ +00/ + DATA (CF(I),I=148,164) /512,16,160,-128,16,-20,124,-2,-20,-56 + $ ,124,160,-20,16,-2,124,142/ C 1 T(2,1,6,5,3,4) - DATA (CF(I, 9),I= 1, 6) /1.481481481481481D-01, - $ -1.851851851851852D-02,1.481481481481481D+00, - $ -1.851851851851852D-01,1.314814814814815D+00,1.148148148148148D - $ +00/ - DATA (CF(I, 9),I= 7, 12) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,9.481481481481481D+00, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01/ - DATA (CF(I, 9),I= 13, 18) /1.481481481481481D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 9),I= 19, 24) /1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -1.851851851851852D-01/ + DATA (CF(I),I=165,180) /512,-128,160,16,16,-2,-128,16,-20,-2,124 + $ ,-56,-20,-2,124,-20/ C 1 T(2,5,1,6,3,4) - DATA (CF(I, 10),I= 1, 6) /-1.851851851851852D-02, - $ -1.851851851851852D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -5.185185185185185D-01/ - DATA (CF(I, 10),I= 7, 12) /1.481481481481481D-01 - $ ,1.481481481481481D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 10),I= 13, 18) /-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 10),I= 19, 24) /1.314814814814815D+00 - $ ,1.148148148148148D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01 - $ ,1.481481481481481D+00/ + DATA (CF(I),I=181,195) /512,16,-128,-2,-20,16,-128,-2,16,142,124 + $ ,-2,16,-20,160/ C 1 T(2,5,6,1,3,4) - DATA (CF(I, 11),I= 1, 6) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.314814814814815D+00,1.148148148148148D - $ +00,1.481481481481481D+00,-1.851851851851852D-01/ - DATA (CF(I, 11),I= 7, 12) /1.481481481481481D-01, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01,9.481481481481481D+00,-1.185185185185185D+00/ - DATA (CF(I, 11),I= 13, 18) /1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -1.851851851851852D-01/ - DATA (CF(I, 11),I= 19, 24) /1.481481481481481D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ + DATA (CF(I),I=196,209) /512,-128,124,-56,-20,-2,124,-20,16,-2, + $ -128,16,-20,-2/ C 1 T(2,6,1,5,3,4) - DATA 
(CF(I, 12),I= 1, 6) /-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 12),I= 7, 12) /1.481481481481481D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00/ - DATA (CF(I, 12),I= 13, 18) /1.314814814814815D+00 - $ ,1.148148148148148D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01 - $ ,1.481481481481481D+00/ - DATA (CF(I, 12),I= 19, 24) /-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ + DATA (CF(I),I=210,222) /512,142,124,-2,16,-20,160,-2,-20,16,-128 + $ ,-2,16/ C 1 T(2,6,5,1,3,4) - DATA (CF(I, 13),I= 1, 6) /1.481481481481481D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 13),I= 7, 12) /1.481481481481481D+00, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,1.148148148148148D+00,1.314814814814815D - $ +00/ - DATA (CF(I, 13),I= 13, 18) /9.481481481481481D+00, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01,1.481481481481481D - $ +00/ - DATA (CF(I, 13),I= 19, 24) /1.148148148148148D+00, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01/ + DATA (CF(I),I=223,234) /512,-128,-128,16,16,160,124,-20,-56,124, + $ -2,-20/ C 1 T(5,1,2,6,3,4) - DATA (CF(I, 14),I= 1, 6) /-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 14),I= 7, 12) /-1.851851851851852D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 14),I= 13, 18) /-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01,1.481481481481481D - $ +00,-1.185185185185185D+00,1.481481481481481D-01/ - DATA (CF(I, 14),I= 19, 24) /-1.851851851851852D-01 - $ ,1.481481481481481D+00,1.148148148148148D+00,1.314814814814815D - $ +00,1.481481481481481D-01,-1.851851851851852D-02/ + DATA (CF(I),I=235,245) /512,16,160,-128,16,-20,160,124,142,16,-2/ C 1 T(5,1,6,2,3,4) - DATA (CF(I, 15),I= 1, 6) /1.481481481481481D+00, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,1.148148148148148D+00,1.314814814814815D - $ +00/ - DATA (CF(I, 15),I= 7, 12) /1.481481481481481D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 15),I= 13, 18) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,9.481481481481481D+00, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01/ - DATA (CF(I, 15),I= 19, 24) /-5.185185185185185D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -1.851851851851852D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ + DATA (CF(I),I=246,255) /512,-128,160,16,-56,124,124,-20,-20,-2/ C 1 T(5,2,1,6,3,4) - DATA (CF(I, 16),I= 1, 6) /-1.851851851851852D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 16),I= 7, 12) /-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ 
-1.185185185185185D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 16),I= 13, 18) /1.481481481481481D-01 - $ ,1.481481481481481D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 16),I= 19, 24) /1.148148148148148D+00 - $ ,1.314814814814815D+00,-1.851851851851852D-01 - $ ,1.481481481481481D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ + DATA (CF(I),I=256,264) /512,16,-128,124,142,-20,160,-2,16/ C 1 T(5,2,6,1,3,4) - DATA (CF(I, 17),I= 1, 6) /1.314814814814815D+00 - $ ,1.148148148148148D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01 - $ ,1.481481481481481D+00/ - DATA (CF(I, 17),I= 7, 12) /1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -1.851851851851852D-01/ - DATA (CF(I, 17),I= 13, 18) /1.481481481481481D-01, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01,9.481481481481481D+00,-1.185185185185185D+00/ - DATA (CF(I, 17),I= 19, 24) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01/ + DATA (CF(I),I=265,272) /512,-128,-2,16,-20,-2,-128,16/ C 1 T(5,6,1,2,3,4) - DATA (CF(I, 18),I= 1, 6) /1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -1.851851851851852D-01/ - DATA (CF(I, 18),I= 7, 12) /1.314814814814815D+00 - $ ,1.148148148148148D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01 - $ ,1.481481481481481D+00/ - DATA (CF(I, 18),I= 13, 18) /1.481481481481481D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00/ - DATA (CF(I, 18),I= 19, 24) /-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00/ + DATA (CF(I),I=273,279) /512,-20,-2,-2,16,16,-128/ C 1 T(5,6,2,1,3,4) - DATA (CF(I, 19),I= 1, 6) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01/ - DATA (CF(I, 19),I= 7, 12) /-1.851851851851852D-01 - $ ,1.481481481481481D+00,1.148148148148148D+00,1.314814814814815D - $ +00,1.481481481481481D-01,-1.851851851851852D-02/ - DATA (CF(I, 19),I= 13, 18) /1.148148148148148D+00, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 19),I= 19, 24) /9.481481481481481D+00, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01,1.481481481481481D - $ +00/ + DATA (CF(I),I=280,285) /512,-128,-128,16,16,160/ C 1 T(6,1,2,5,3,4) - DATA (CF(I, 20),I= 1, 6) /-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 20),I= 7, 12) /1.148148148148148D+00, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 20),I= 13, 18) /-1.851851851851852D-01 - $ ,1.481481481481481D+00,1.148148148148148D+00,1.314814814814815D - $ +00,1.481481481481481D-01,-1.851851851851852D-02/ - DATA (CF(I, 20),I= 19, 24) /-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01,1.481481481481481D - $ 
+00,-1.185185185185185D+00,1.481481481481481D-01/ + DATA (CF(I),I=286,290) /512,16,160,-128,16/ C 1 T(6,1,5,2,3,4) - DATA (CF(I, 21),I= 1, 6) /-1.851851851851852D-01 - $ ,1.481481481481481D+00,1.148148148148148D+00,1.314814814814815D - $ +00,1.481481481481481D-01,-1.851851851851852D-02/ - DATA (CF(I, 21),I= 7, 12) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01/ - DATA (CF(I, 21),I= 13, 18) /-5.185185185185185D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -1.851851851851852D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 21),I= 19, 24) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,9.481481481481481D+00, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01/ + DATA (CF(I),I=291,294) /512,-128,160,16/ C 1 T(6,2,1,5,3,4) - DATA (CF(I, 22),I= 1, 6) /1.148148148148148D+00, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 22),I= 7, 12) /-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 22),I= 13, 18) /1.148148148148148D+00 - $ ,1.314814814814815D+00,-1.851851851851852D-01 - $ ,1.481481481481481D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 22),I= 19, 24) /1.481481481481481D-01 - $ ,1.481481481481481D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01, - $ -1.185185185185185D+00/ + DATA (CF(I),I=295,297) /512,16,-128/ C 1 T(6,2,5,1,3,4) - DATA (CF(I, 23),I= 1, 6) /1.148148148148148D+00 - $ ,1.314814814814815D+00,-1.851851851851852D-01 - $ ,1.481481481481481D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 23),I= 7, 12) /-5.185185185185185D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -1.851851851851852D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 23),I= 13, 18) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01/ - DATA (CF(I, 23),I= 19, 24) /1.481481481481481D-01, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01,9.481481481481481D+00,-1.185185185185185D+00/ + DATA (CF(I),I=298,299) /512,-128/ C 1 T(6,5,1,2,3,4) - DATA (CF(I, 24),I= 1, 6) /-5.185185185185185D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -1.851851851851852D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 24),I= 7, 12) /1.148148148148148D+00 - $ ,1.314814814814815D+00,-1.851851851851852D-01 - $ ,1.481481481481481D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 24),I= 13, 18) /-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 24),I= 19, 24) /1.481481481481481D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00/ + DATA (CF(I),I=300,300) /512/ C 1 T(6,5,2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. 
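The DATA statements above replace the dense REAL*8 CF(NCOLOR,NCOLOR) color matrix with a packed integer upper triangle CF(NCOLOR*(NCOLOR+1)/2) (300 entries for the 24 color flows of this process) plus a common denominator DATA DENOM/54/: each packed value is the old real entry scaled by DENOM on the diagonal and by 2*DENOM off the diagonal (for example 9.481481...*54 = 512 and -1.185185...*2*54 = -128), so the color sum further down in this diff only runs over J >= I and divides once by DENOM at the end. A minimal C++ sketch of the same packed accumulation is shown below; the function and variable names are illustrative only and are not taken from this patch.

#include <complex>
#include <vector>
// me2 = ( sum_i Re[ conj(jamp_i) * sum_{j>=i} cf_packed(i,j) * jamp_j ] ) / denom
double colorSumPacked( const std::vector<std::complex<double>>& jamp, // one color amplitude per color flow (one helicity)
                       const std::vector<int>& cf,                    // packed upper triangle, size n*(n+1)/2
                       const int denom )
{
  const int ncolor = (int)jamp.size();
  double me2 = 0;
  int idx = 0; // walks the packed upper triangle row by row
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = i; j < ncolor; j++ )
      ztemp += (double)cf[idx++] * jamp[j]; // off-diagonal entries already carry the factor 2
    me2 += std::real( ztemp * std::conj( jamp[i] ) );
  }
  return me2 / denom;
}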
- IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -1547,10 +1208,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -1559,6 +1222,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(2)=AMP2(2)+AMP(4)*DCONJG(AMP(4)) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/addmothers.f index 9a31ed201d..d6cded9a2d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else @@ -220,7 +221,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, ncolmp=0 endif if(mo_color.gt.1.and. - $ mo_color.ne.3.and.mo_color.ne.8)then + $ mo_color.ne.3.and.mo_color.ne.8.and.mo_color.ne.6)then da_color(1)=get_color(jpart(1,ida(1))) da_color(2)=get_color(jpart(1,ida(2))) call write_error(da_color(1), da_color(2), mo_color) @@ -326,8 +327,8 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif endif !end of check on LC -c Just zero helicity info for intermediate states - jpart(7,i) = 0 +c Just No helicity info for intermediate states + jpart(7,i) = 9 enddo ! 
do i 100 continue if (is_LC) call check_pure_internal_flow(icolalt,jpart, maxcolor) @@ -586,13 +587,13 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, i3=i3+1 c color for t-channels needs to be reversed if(i3.eq.1) icol(2,ires)=icolmp(1,i) - if(i3.eq.2) icol(1,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 c color for t-channels needs to be reversed if(i3bar.eq.1) icol(1,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(2,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(2,i) endif enddo @@ -764,6 +765,14 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, endif endif c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) + elseif(mo_color.eq.6.and.i3.eq.0.and.i3bar.eq.2)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue + elseif(mo_color.eq.6.and.i3.eq.2.and.i3bar.eq.0)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue else c Don't know how to deal with this call write_error(i3,i3bar,mo_color) @@ -814,12 +823,12 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(icolmp(1,i).gt.0)then i3=i3+1 if(i3.eq.1) icol(1,ires)=icolmp(1,i) - if(i3.eq.2) icol(2,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 if(i3bar.eq.1) icol(2,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(1,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(2,i) endif enddo @@ -830,23 +839,33 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(n3.le.1.and.n3bar.eq.0) icol(2,ires)=0 if(i3.ne.n3.or.i3bar.ne.n3bar) then - if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.0.and.i3.eq.0)then + if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.i3)then c This is an epsilon index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(1,ires)=maxcolor + if(i3.eq.0) then + maxcolor=maxcolor+1 + icol(1,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(2,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(2,ires)=-maxcolor endif - elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.0.and.i3bar.eq.0)then + elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.i3bar)then c This is an epsilonbar index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(2,ires)=maxcolor + if(i3bar.eq.0)then + maxcolor=maxcolor+1 + icol(2,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(1,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(1,ires)=-maxcolor endif elseif(n3.gt.0.and.n3bar.eq.0.and.i3-i3bar.eq.n3.or. $ n3bar.gt.0.and.n3.eq.0.and.i3bar-i3.eq.n3bar.or. 
@@ -961,6 +980,12 @@ subroutine fix_s_color_indices(n3,n3bar,i3,i3bar,ncolmp,icolmp, if(n3.eq.1) icol(1,ires)=max_n3 if(n3bar.eq.1) icol(2,ires)=min_n3bar endif + do i=ires,-1 + if (icol(1,i).eq.maxcol) icol(1,i)=mincol + if (icol(1,i).eq.-maxcol) icol(1,i)=-mincol + if (icol(2,i).eq.maxcol) icol(2,i)=mincol + if (icol(2,i).eq.-maxcol) icol(2,i)=-mincol + enddo c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) endif else diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cluster.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/color_sum.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * 
blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
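The HASBLAS block above decides at build time whether cuBLAS/hipBLAS support is compiled in (for example "make HASBLAS=hasNoBlas" adds -DMGONGPU_HAS_NO_BLAS and leaves BLASLIBFLAGS empty), while color_sum_gpu earlier in this diff falls back to the kernel path at run time whenever no BLAS handle is passed (its comments tie that case to CUDACPP_RUNTIME_BLASCOLORSUM not being set). A rough C++ sketch of this two-level gating follows; the helper name and the "any non-empty value enables it" convention are assumptions for illustration, not code from this patch.

#include <cstdlib>
// Build-time switch: -DMGONGPU_HAS_NO_BLAS compiles the BLAS color sum out entirely.
// Run-time switch: an environment variable decides whether a BLAS handle is created at all.
inline bool useBlasColorSum()
{
#ifdef MGONGPU_HAS_NO_BLAS
  return false; // hasNoBlas build: only the kernel-based color sum is available
#else
  const char* env = std::getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" );
  return env != nullptr && env[0] != '\0'; // assumption: any non-empty value requests the BLAS path
#endif
}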
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
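# As an illustration of the two object groups formed below (assuming, hypothetically, that this
# P1 directory contains auto_dsig.f and auto_dsig1.f), the pattern rules expand to
#   DSIG         = driver.o auto_dsig1.o
#   DSIG_cudacpp = driver_cudacpp.o auto_dsig1_cudacpp.o
# i.e. auto_dsig.f itself is filtered out (it is compiled separately as auto_dsig.o and linked
# explicitly in the rules below), while the numbered auto_dsig*.f files are compiled once for the
# plain Fortran build and once with -DMG5AMC_MEEXPORTER_CUDACPP for the cudacpp-bridged build.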
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cuts.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. + return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. 
Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. + +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/genps.f index 1c32e93f5d..5449ab9e30 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/genps.f @@ -124,7 +124,8 @@ subroutine gen_mom(iconfig,mincfig,maxcfig,invar,wgt,x,p1) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer fake_id + common/to_sprop/sprop,tprid,fake_id logical firsttime double precision xprop(3,nexternal),tprop(3,nexternal) @@ -1373,6 +1374,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1389,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not. 
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1637,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1657,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1818,8 +1834,8 @@ double precision function get_channel_cut(p, config) common/to_forest/ iforest, tstrategy integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) - integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer tprid(-max_branch:-1,lmaxconfigs), fake_id + common/to_sprop/sprop,tprid,fake_id double precision stot,m1,m2 common/to_stot/stot,m1,m2 @@ -1915,7 +1931,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1946,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile deleted file mode 100644 index 49e6800fff..0000000000 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile +++ /dev/null @@ -1,327 +0,0 @@ -SHELL := /bin/bash - -include ../../Source/make_opts - -# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) -# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing -include ../../src/cudacpp_config.mk -ifeq ($(CUDACPP_BUILDDIR),) -$(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) -endif - -# Disable all Fortran warnings? 
-FFLAGS+= -w - -# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html -FFLAGS+= -cpp - -# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) -CXXFLAGS = -O3 -Wall -Wshadow -Wextra - -# Add -std=c++17 explicitly to avoid build errors on macOS -# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" -ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 -endif - -# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) -ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) - override CXX:=ccache $(CXX) -endif -###ifeq ($(USECCACHE)$(shell echo $(FC) | grep ccache),1) -### override FC:=ccache $(FC) -###endif - -# Load additional dependencies of the bias module, if present -ifeq (,$(wildcard ../bias_dependencies)) -BIASDEPENDENCIES = -else -include ../bias_dependencies -endif - -# Definitions - -LIBDIR = ../../lib/ -BINDIR = ../../bin/ -PROG = madevent - -ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") - include ../MadLoop_makefile_definitions -else - LINK_LOOP_LIBS = - LOOP_LIBS = - LOOP_INCLUDE = - LINK_MADLOOP_LIB = - MADLOOP_LIB = -endif - -LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias - -CUDACPP_MAKEFILE=cudacpp.mk -processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') -ifeq ($(BACKEND),cuda) -CUDACPP_COMMONLIB=mg5amc_common_cuda -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda -else ifeq ($(BACKEND),hip) -CUDACPP_COMMONLIB=mg5amc_common_hip -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip -else -CUDACPP_COMMONLIB=mg5amc_common_cpp -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp -endif - -LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) - -ifneq ("$(wildcard ../../Source/RUNNING)","") - LINKLIBS += -lrunning - LIBS += $(LIBDIR)librunning.$(libext) -endif - - -# Source files - -MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) -MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) -ifeq ($(strip $(MATRIX_HEL)),) - MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) -endif - - -PROCESS= myamp.o genps.o unwgt.o setcuts.o get_color.o \ - cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ - idenparts.o dummy_fct.o - -DSIG=driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) -DSIG_cudacpp=driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) - -SYMMETRY = symmetry.o idenparts.o - -# Binaries - -ifeq ($(UNAME),Darwin) -LDFLAGS += -lc++ # avoid 'Undefined symbols' for chrono::steady_clock on macOS (checked with otool -L libmg5amc_gg_ttx_cpp.so) -LDFLAGS += -mmacosx-version-min=11.3 # avoid "ld: warning: object file was built for newer macOS version than being linked" -else -LDFLAGS += -Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 (not supported on macOS) -endif - -# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) -.DEFAULT_GOAL := all - -ifeq ($(BACKEND),cuda) -all: $(PROG)_fortran 
$(CUDACPP_BUILDDIR)/$(PROG)_cuda -else ifeq ($(BACKEND),hip) -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip -else -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp -endif - -# Disable OpenMP by default: enable OpenMP only if USEOPENMP=1 (#758) -ifeq ($(USEOPENMP),1) -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -override OMPFLAGS = -fopenmp -LINKLIBS += -liomp5 # see #578 -LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -override OMPFLAGS = -fopenmp -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 -else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang -else -override OMPFLAGS = -fopenmp -endif -endif - -$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o - $(FC) -o $(PROG)_fortran $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) - -$(LIBS): .libs - -.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat - cd ../../Source; make - touch $@ - -$(CUDACPP_BUILDDIR)/.cudacpplibs: - $(MAKE) -f $(CUDACPP_MAKEFILE) - touch $@ - -# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH -# Use relative paths with respect to the executables ($ORIGIN on Linux) -# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary -ifeq ($(UNAME_S),Darwin) - override LIBFLAGSRPATH = -else ifeq ($(USEBUILDDIR),1) - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' -else - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' -endif - -.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link - -madevent_fortran_link: $(PROG)_fortran - rm -f $(PROG) - ln -s $(PROG)_fortran $(PROG) - -madevent_cuda_link: - $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) - -madevent_hip_link: - $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) - -madevent_cpp_link: - $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -override SUPPORTED_AVXS = cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto -madevent_%_link: - @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then echo "ERROR! 
Invalid target '$@' (supported madevent_cpp*_link targets are: $(foreach avx,$(SUPPORTED_AVXS),'madevent_cpp$(avx)_link'))"; exit 1; fi - $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_cuda now uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_hip also uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -counters.o: counters.cc timer.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -ompnumthreads.o: ompnumthreads.cc ompnumthreads.h - $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ - -$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) - $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) - -gensym: $(SYMMETRY) configs.inc $(LIBS) - $(FC) -o gensym $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) - -###ifeq (,$(wildcard fbridge.inc)) # Pointless: fbridge.inc always exists as this is the cudacpp-modified makefile! -###$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat -### cd ../../Source/MODEL; make -### -###$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat -### cd ../../Source; make -### -###$(LIBDIR)libpdf.$(libext): -### cd ../../Source/PDF; make -### -###$(LIBDIR)libgammaUPC.$(libext): -### cd ../../Source/PDF/gammaUPC; make -###endif - -# Add source so that the compiler finds the DiscreteSampler module. 
-$(MATRIX): %.o: %.f - $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%.o: %.f - $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%_cudacpp.o: %.f - $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ - -# Dependencies - -driver.f: genps.inc -symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc -genps.o: genps.inc nexternal.inc configs.inc -dummy_fct.0: run.inc genps.inc -cuts.o: genps.inc nexternal.inc pmass.inc -setcuts.o: genps.inc run_config.inc -invarients.o: genps.inc nexternal.inc -myamp.o: props.inc genps.inc nexternal.inc -reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ - run_config.inc -cluster.o: cluster.inc genps.inc nexternal.inc message.inc -addmothers.o: genps.inc nexternal.inc symswap.inc message.inc -unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ - run_config.inc -initcluster.o: message.inc - -# Extra dependencies on discretesampler.mod - -auto_dsig.o: .libs -driver.o: .libs -driver_cudacpp.o: .libs -$(MATRIX): .libs -genps.o: .libs - -# Cudacpp bldall targets - -ifeq ($(UNAME_P),ppc64le) -bldavxs: bldnone bldsse4 -else ifeq ($(UNAME_P),arm) -bldavxs: bldnone bldsse4 -else -bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z -endif - -ifneq ($(shell which hipcc 2>/dev/null),) -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldhip bldcuda bldavxs -else -bldall: bldhip bldavxs -endif -else -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldcuda bldavxs -else -bldall: bldavxs -endif -endif - -bldcuda: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cuda - -bldhip: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=hip - -bldnone: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppnone - -bldsse4: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 - -bldavx2: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 - -bld512y: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y - -bld512z: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z - -# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) - -clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn - $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(CUDACPP_BUILDDIR)/$(PROG)_hip - -cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src - $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall - rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs - rm -f .libs - -cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src - make -C ../../Source cleanall - rm -rf $(LIBDIR)libbias.$(libext) - rm -f ../../Source/*.mod ../../Source/*/*.mod - -distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation - $(MAKE) -f $(CUDACPP_MAKEFILE) distclean diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile new file mode 120000 index 0000000000..9fba275947 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile @@ -0,0 +1 @@ +makefile_wrapper.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile_original.mk new file 
mode 100644 index 0000000000..348c283be7 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile_original.mk @@ -0,0 +1,101 @@ +include ../../Source/make_opts +FFLAGS+= -w + +# Load additional dependencies of the bias module, if present +ifeq (,$(wildcard ../bias_dependencies)) +BIASDEPENDENCIES = +else +include ../bias_dependencies +endif + +# Definitions + +LIBDIR = ../../lib/ +BINDIR = ../../bin/ +PROG = madevent + +ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") + include ../MadLoop_makefile_definitions +else + LINK_LOOP_LIBS = + LOOP_LIBS = + LOOP_INCLUDE = + LINK_MADLOOP_LIB = + MADLOOP_LIB = +endif + +LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L../../lib/ -ldhelas -ldsample -lmodel -lgeneric -lpdf -lgammaUPC -lcernlib $(llhapdf) -lbias + +LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) + +ifneq ("$(wildcard ../../Source/RUNNING)","") + LINKLIBS += -lrunning + LIBS += $(LIBDIR)librunning.$(libext) +endif + + +# Source files + +MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) +MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) +ifeq ($(strip $(MATRIX_HEL)),) + MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) +endif + + +PROCESS= driver.o myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o \ + $(patsubst %.f,%.o,$(wildcard auto_dsig*.f)) \ + +SYMMETRY = symmetry.o idenparts.o + +# Binaries + +$(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) + $(FC) -o $(PROG) $(PROCESS) $(MATRIX) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) + +$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat + cd ../../Source/MODEL; make + +$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat + cd ../../Source; make + +$(LIBDIR)libpdf.$(libext): + cd ../../Source/PDF; make + +$(LIBDIR)libgammaUPC.$(libext): + cd ../../Source/PDF/gammaUPC; make + +# Add source so that the compiler finds the DiscreteSampler module. 
+$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +# Dependencies + +driver.f: genps.inc +symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc +genps.o: genps.inc nexternal.inc configs.inc +dummy_fct.0: run.inc genps.inc +cuts.o: genps.inc nexternal.inc pmass.inc +setcuts.o: genps.inc run_config.inc +invarients.o: genps.inc nexternal.inc +myamp.o: props.inc genps.inc nexternal.inc +reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ + run_config.inc +cluster.o: cluster.inc genps.inc nexternal.inc message.inc +addmothers.o: genps.inc nexternal.inc symswap.inc message.inc +unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ + run_config.inc +initcluster.o: message.inc + +clean: + $(RM) *.o gensym madevent madevent_forhel diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/myamp.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min with dsqrt_shatmax**2 with the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/reweight.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/reweight.f index 0a0bafa7c1..9d8fe1c4f0 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/reweight.f @@ -976,9 +976,9 @@ logical function setclscales(p, keepq2bck, ivec) $ ' and jcentral is ',jcentral(1),jcentral(2) if (btest(mlevel,3)) then - write(*,'(a$)') 'QCD jets (final): ' + write(*,'(a,$)') 'QCD jets (final): ' do i=3,nexternal - if(iqjets(i).gt.0) write(*,'(i3$)') i + if(iqjets(i).gt.0) write(*,'(i3,$)') i enddo write(*,*) endif @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) @@ -1387,7 +1387,9 @@ double precision function rewgt(p, ivec) integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - include 'configs.inc' + integer fake_id + common/to_sprop/sprop,tprid,fake_id +c include 'configs.inc' real*8 xptj,xptb,xpta,xptl,xmtc real*8 xetamin,xqcut,deltaeta common /to_specxpt/xptj,xptb,xpta,xptl,xmtc,xetamin,xqcut,deltaeta @@ -1588,6 +1590,8 @@ double precision function rewgt(p, ivec) $ ipdgcl(1,igraphs(1),iproc),ipart,.false.).and. $ (goodjet(idacl(n,1)).or.goodjet(idacl(n,2)))) then c alpha_s weight + + if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1600,6 +1604,7 @@ double precision function rewgt(p, ivec) write(*,*)' as: ',alphas(alpsfact*dsqrt(q2now)), & '/',asref,' -> ',alphas(alpsfact*dsqrt(q2now))/asref write(*,*)' and G=',SQRT(4d0*PI*ALPHAS(scale)) + endif endif endif endif diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/symmetry.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/symmetry.f index 309540a0a2..d0706e90b4 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/symmetry.f @@ -51,6 +51,7 @@ program symmetry integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) + integer fake_id include 'configs.inc' data use_config/0,lmaxconfigs*0/ @@ -232,7 +233,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +243,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +261,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/unwgt.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/gg_ttgg.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/gg_ttgg.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py index 42d82818d0..2bc6174b85 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3306,7 +3353,7 @@ def edit_dummy_fct_from_file(self, filelist, outdir): def retro_compatible_custom_fct(lines, mode=None): f77_type = ['real*8', 'integer', 'double precision', 'logical'] - function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ + function_pat = re.compile(r'^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ % {'type':'|'.join(f77_type)}, re.I+re.M) include_pat = re.compile(r"\s+include\s+[\'\"]([\w\./]*)") @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - 
logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/check_param_card.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py index 9ff7390cf5..8de498fcc2 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. 
Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. (beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. 
Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" @@ -6868,8 +6890,9 @@ def handle_alarm(signum, frame): else: log_level=20 - - if run_card: + if run_card and (run_card['lpp1'] !=0 or run_card['lpp2'] !=0): + # They are likely case like lpp=+-3, where alpas not need reset + # but those have dedicated name of pdf avoid the reset as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, 'cteq6_l': 0.118, diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/file_writers.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/files.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/files.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - logger.warning(why) 
+ logger.warning("fail to cp", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "

To save bandwidth not all diagrams were converted to PNG."; print PAGE "

To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 'nprocs' in opts: + 
self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/histograms.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/histograms.py index 51ae2914fc..0883cd9613 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the variation. 
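
The gen_ximprove_gridpack changes earlier in this hunk split every channel that needs more than max_request_event events into several jobs: nb_split is a ceiling division of the needed events, forced back to 1 when split_channels is off, clamped to max_splitting, and each split job gets a directory suffix made of a cycled letter plus a counter. The standalone sketch below only illustrates that arithmetic; compute_nb_split and split_dir_names are hypothetical helpers, and the assumption that self.alphabet is the lowercase alphabet is mine, not taken from the diff.

    import string

    def compute_nb_split(needed_event, max_request_event, max_splitting, split_channels=True):
        # Ceiling division of the requested events, clamped as in the hunk above.
        nb_split = int(max(1, (needed_event - 1) // max_request_event + 1))
        if not split_channels:
            nb_split = 1
        return max(1, min(nb_split, max_splitting))

    def split_dir_names(directory, nb_split):
        # One subdirectory name per split: a cycled letter plus a counter.
        alphabet = string.ascii_lowercase
        return [directory + alphabet[i % 26] + str((i + 1) // 26) for i in range(nb_split)]

    nb = compute_nb_split(needed_event=12000, max_request_event=2500, max_splitting=100)
    print(nb, split_dir_names('G1', nb))

With 12000 requested events and a 2500-event cap this yields 5 split jobs named G1a0 ... G1e0, matching the suffix formula used in the diff.
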
+ if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2405,10 +2404,10 @@ def output(self, path, format='gnuplot',number_of_ratios = -1, gnuplot_output_list=gnuplot_output_list_v5 else: output, _ = p.communicate() - output.decode(errors='ignore') + output = output.decode(errors='ignore') if not output: gnuplot_output_list=gnuplot_output_list_v5 - elif float(output.split()[1]) < 5. : + elif int(output.split()[1].split('.')[0]) < 5 : gnuplot_output_list=gnuplot_output_list_v4 else: gnuplot_output_list=gnuplot_output_list_v5 @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. 
Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. 
import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/lhe_parser.py index f6e47956cd..d4b94bab10 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -1792,7 +1794,10 @@ def add_decays(self, pdg_to_decay): if particle.pdg in pdg_to_decay and pdg_to_decay[particle.pdg]: one_decay = pdg_to_decay[particle.pdg].pop() self.add_decay_to_particle(i, one_decay) + particle.helicity = 9 return 
self.add_decays(pdg_to_decay) + + return self @@ -2166,10 +2171,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2961,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..dea35930ea 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz 
%(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3667,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_comine_iteration(self, line): + def do_combine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
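
The hunks above make the SysCalc and matching-veto settings of the Pythia8 card conditional on whether the MG5aMC_PY8_interface driver is used, since only that driver performs the merging veto and reweighting itself. The sketch below is a reduced stand-in for that gating logic: the card is a plain dictionary rather than the real PY8Card object, and apply_merging_settings is a hypothetical helper written only to show the control flow.

    def apply_merging_settings(card, run_card, use_mg5amc_py8_interface):
        # Stand-in for the conditional card setup above: the SysCalc and
        # matching-veto settings are only applied when the MG5aMC_PY8_interface
        # driver steers Pythia8, and are skipped otherwise.
        if not use_mg5amc_py8_interface:
            return card
        if card.get('SysCalc:qWeed', -1.0) == -1.0:
            card['SysCalc:qWeed'] = run_card['xqcut']
        if run_card['use_syst']:
            # The driver applies the merging veto itself, so Pythia8 must not.
            card['JetMatching:doVeto'] = False
        return card

    run_card = {'xqcut': 20.0, 'use_syst': True}
    print(apply_merging_settings({'SysCalc:qWeed': -1.0}, run_card, True))
    print(apply_merging_settings({'SysCalc:qWeed': -1.0}, run_card, False))
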
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
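
The do_pythia8 changes above look for Pythia8's bundled main164 example in two locations under pythia8_path and, if neither exists, fall back to the old MG5aMC_PY8_interface via the '--old_interface' path. A minimal sketch of that lookup is below; find_pythia8_driver is a hypothetical helper and the example install prefix is invented, but the two candidate paths are the ones quoted in the diff.

    import os

    def find_pythia8_driver(pythia8_path):
        # Return the path of Pythia8's bundled main164 example if present,
        # or None to signal a fallback to the old MG5aMC_PY8_interface.
        candidates = [
            os.path.join(pythia8_path, 'share', 'Pythia8', 'examples', 'main164'),
            os.path.join(pythia8_path, 'examples', 'main164'),
        ]
        for exe in candidates:
            if os.path.exists(exe):
                return exe
        return None

    driver = find_pythia8_driver('/opt/pythia8')  # hypothetical install prefix
    print(driver or "main164 not found: falling back to '--old_interface'")
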
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
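
Each split job above receives its own PY8 card: Main:numberOfEvents is set to the split size and HEPMCoutput:scaling is multiplied by that size, so the showered weights stay correctly normalised after the LHE file is partitioned. The snippet below reproduces just that arithmetic with a plain dictionary standing in for the card object; cards_for_splits is a hypothetical helper, but the scaling rule follows the diff.

    def cards_for_splits(base_card, partition_for_PY8):
        # One card (as a plain dict) per split: the event count matches the
        # split size and the HEPMC weight scaling is multiplied by that size.
        cards = []
        for nevents in partition_for_PY8:
            card = dict(base_card)
            card['Main:numberOfEvents'] = nevents
            card['HEPMCoutput:scaling'] = base_card['HEPMCoutput:scaling'] * float(nevents)
            cards.append(card)
        return cards

    base = {'Main:numberOfEvents': 0, 'HEPMCoutput:scaling': 1.0e-9}
    for i, card in enumerate(cards_for_splits(base, [2500, 2500, 1200])):
        print(i, card['Main:numberOfEvents'], card['HEPMCoutput:scaling'])
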
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -5596,6 +5635,19 @@ def do_plot(self, line): else: logger.info('No valid files for delphes plot') + def do_compile(self, line): + """compile the current directory """ + + args = self.split_arg(line) + self.ask_run_configuration(mode='parton') + self.run_card = banner_mod.RunCard(pjoin(self.me_dir, 'Cards', 'run_card.dat')) + self.configure_directory(html_opening =False) + + for Pdir in self.get_Pdir(): + misc.sprint(Pdir) + self.compile(['gensym'], cwd=Pdir) + self.compile(['madevent_forhel'], cwd=Pdir) + ############################################################################ def do_syscalc(self, line): """Evaluate systematics variation weights for a given run""" @@ -6132,7 +6184,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in 
Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file.' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6896,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6906,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7023,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... 
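
The remove_empty_events method added above drops G directories whose events.lhe is missing or essentially empty, then scans each channel log backwards to classify why: an impossible Breit-Wigner configuration, no points passing the cuts, all amplitudes returning zero, or nothing recognisable. The snippet below is a reduced, hypothetical version of that classification over a list of log lines; only the trigger strings are taken from the diff, the rest is illustrative.

    import collections

    TRIGGERS = {
        'Impossible BW configuration': 'bwconfig',
        'Loosen cuts or increase max_events': 'cuts',
        'all returned zero': 'zero',
    }

    def classify_empty_channel(log_lines):
        # Scan the channel log from the end and return why events.lhe is empty
        # ('unknown' if none of the known messages is found).
        for line in reversed(log_lines):
            for trigger, reason in TRIGGERS.items():
                if trigger in line:
                    return reason
        return 'unknown'

    reasons = collections.defaultdict(list)
    logs = {
        'P1/G1': ['... Impossible BW configuration ...'],
        'P1/G2': ['... Loosen cuts or increase max_events ...'],
        'P1/G3': ['nothing recognisable here'],
    }
    for gdir, lines in logs.items():
        reasons[classify_empty_channel(lines)].append(gdir)
    print(dict(reasons))
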
self.exec_cmd('combine_events') @@ -6902,6 +7051,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7066,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7078,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7203,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7223,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/restore_data b/epochX/cudacpp/gg_ttgg.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/restore_data +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . 
-mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/sum_html.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s

' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/madevent b/epochX/cudacpp/gg_ttgg.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/madevent +++ b/epochX/cudacpp/gg_ttgg.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h index 53dd560ed6..da11e740d9 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc index 47a3a011b8..a5e188e4f8 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h index 76066c7bb1..24e0e80f84 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h index 7c6a082392..be5c5a6357 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index 7e5a3007eb..04760e59cb 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +57,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0059185028076171875  +DEBUG: model prefixing takes 0.005910158157348633  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,33 +150,33 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.156 s +1 processes with 123 diagrams generated in 0.162 s Total: 1 processes with 123 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. 
-Generated helas calls for 1 subprocesses (123 diagrams) in 0.422 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. +Generated helas calls for 1 subprocesses (123 diagrams) in 0.359 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.316 s +ALOHA: aloha creates 5 routines in 0.261 s VVV1 VVV1 FFV1 @@ -190,17 +189,17 @@ ALOHA: aloha creates 5 routines in 0.316 s VVVV3 VVVV4 VVVV4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. quit -real 0m1.455s -user 0m1.362s -sys 0m0.060s -Code generation completed in 1 seconds +real 0m1.337s +user 0m1.238s +sys 0m0.074s +Code generation completed in 2 seconds diff --git a/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT b/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. 
- * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). 
+ */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
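// The Bridge constructor hunk above sizes the GPU grid by starting from a
// default of 256 threads per block and halving it (never below
// s_gputhreadsmin=32) until nevt == gpublocks*gputhreads. A minimal standalone
// sketch of that logic (the free function chooseGpuGrid_sketch is illustrative,
// not part of the patch):
#include <stdexcept>
#include <string>
inline void chooseGpuGrid_sketch( const int nevt, int& gpublocks, int& gputhreads, const int gputhreadsmin = 32 )
{
  gputhreads = 256;              // default number of gpu threads
  gpublocks = nevt / gputhreads; // initial guess (integer division), fixed up by the loop below
  if( ( nevt < gputhreadsmin ) || ( nevt % gputhreadsmin != 0 ) )
    throw std::runtime_error( "nevt should be a multiple of " + std::to_string( gputhreadsmin ) );
  while( nevt != gpublocks * gputhreads )
  {
    gputhreads /= 2;
    if( gputhreads < gputhreadsmin )
      throw std::logic_error( "cannot choose gputhreads" ); // this should never happen
    gpublocks = nevt / gputhreads;
  }
}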
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
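// The transposition code in this file maps Fortran AOS momenta
// P_MULTI[nevt][npar][np4] onto the C++ AOSOA layout
// momenta[npagM][npar][np4][neppM] with nevt = npagM*neppM. A minimal
// standalone sketch of that index mapping (the function and parameter names
// are illustrative; the cpos/fpos formulas are those used in these hunks):
#include <cassert>
template<typename T>
void transposeMomentaF2C_sketch( T* out, const T* in, const int nevt, const int npar, const int np4, const int neppM )
{
  assert( nevt % neppM == 0 ); // number of events must be a multiple of neppM
  const int npagM = nevt / neppM;
  for( int ipagM = 0; ipagM < npagM; ipagM++ )
    for( int ip4 = 0; ip4 < np4; ip4++ )
      for( int ipar = 0; ipar < npar; ipar++ )
        for( int ieppM = 0; ieppM < neppM; ieppM++ )
        {
          const int ievt = ipagM * neppM + ieppM;                                                  // event index
          const int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM;  // AOSOA position
          const int fpos = ievt * npar * np4 + ipar * np4 + ip4;                                   // AOS position
          out[cpos] = in[fpos];                                                                    // F2C (Fortran to C)
        }
}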
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
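// The GpuRuntime.h hunk above adds a checkGpuBlas/assertGpuBlas helper that
// mirrors the existing checkGpu/assertGpu pattern: any call returning a
// gpuBlasStatus_t is asserted to be GPUBLAS_STATUS_SUCCESS, printing file and
// line otherwise. A minimal usage sketch (the function name below is
// illustrative; the macros are those defined in GpuAbstraction.h/GpuRuntime.h):
#ifdef MGONGPUCPP_GPUIMPL
#ifndef MGONGPU_HAS_NO_BLAS
inline void createAndDestroyBlasHandle_sketch()
{
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) );  // aborts with file/line info on a non-success status
  checkGpuBlas( gpuBlasDestroy( handle ) );
}
#endif
#endif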
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
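// The MatrixElementKernels.cc hunks above make BLAS color sums a runtime
// choice: they are enabled only if the CUDACPP_RUNTIME_BLASCOLORSUM environment
// variable is set and non-empty, and only in builds where BLAS was not disabled
// (MGONGPU_HAS_NO_BLAS). A minimal standalone sketch of that decision logic
// (the helper name useBlasColorSum_sketch is illustrative, not from the patch):
#include <cstdlib>
#include <stdexcept>
#include <string>
inline bool useBlasColorSum_sketch()
{
  const char* blasEnv = std::getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" );
  const bool requested = ( blasEnv && std::string( blasEnv ) != "" );
#ifndef MGONGPU_HAS_NO_BLAS
  return requested; // enable BLAS color sums only on explicit request
#else
  if( requested )
    throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" );
  return false;
#endif
}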
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h index 65a101888d..2fa0ce29e0 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ 
namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer 
DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc index 5956559974..d50b7efcec 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 24; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities -#endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId 
= 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) +#endif + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using 
E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
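//--------------------------------------------------------------------------
// Illustration (not part of the generated code): a minimal standalone sketch of the color-major,
// event-minor layout assumed by the DeviceAccessJamp2 accessor above, i.e. element (icol, ievt)
// of a jamp2 buffer lives at buffer[icol * nevt + ievt]. All names below are hypothetical.
#include <cassert>
#include <cstddef>
#include <vector>

static inline std::size_t jamp2Index( int icol, int ievt, int nevt )
{
  return static_cast<std::size_t>( icol ) * nevt + ievt; // one contiguous slice of nevt values per color
}

int main()
{
  const int ncolor = 24, nevt = 8;
  std::vector<double> jamp2( ncolor * nevt, 0. );
  jamp2[jamp2Index( 2, 5, nevt )] += 1.; // accumulate |jamp|^2 for color 2 of event 5
  assert( jamp2[2 * nevt + 5] == 1. );   // same element, indexed by hand
  return 0;
}
//--------------------------------------------------------------------------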
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -2518,176 +2574,43 @@ namespace mg5amcCpu jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttxgg()?) 
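//--------------------------------------------------------------------------
// Illustration (not part of the generated code): what the "leading color flows" bookkeeping above
// boils down to, as a minimal host-side sketch with hypothetical names: for each color flow icol,
// a running sum over helicities of |jamp[icol]|^2 is kept (cxabs2 in the plugin, std::norm here)
// and is later used for the event-by-event choice of color.
#include <complex>
#include <vector>

int main()
{
  const int ncolor = 24, nGoodHel = 3;
  std::vector<double> jamp2( ncolor, 0. ); // running sum over helicities of |jamp[icol]|^2
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
  {
    // Stand-in partial amplitudes for this helicity (the real code computes them from Feynman diagrams)
    std::vector<std::complex<double>> jamp( ncolor, std::complex<double>( 0.1 * ighel, 0.2 ) );
    for( int icol = 0; icol < ncolor; icol++ )
      jamp2[icol] += std::norm( jamp[icol] ); // |z|^2 = re^2 + im^2; may underflow in float (see #831)
  }
  return 0;
}
//--------------------------------------------------------------------------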
- - // The color denominators (initialize all array elements, with ncolor=24) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54 }; // 1-D array[24] - - // The color matrix (initialize all array elements, with ncolor=24) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 512, -64, -64, 8, 8, 80, -64, 8, 8, -1, -1, -10, 8, -1, 80, -10, 71, 62, -1, -10, -10, 62, 62, -28 }, - { -64, 512, 8, 80, -64, 8, 8, -64, -1, -10, 8, -1, -1, -10, -10, 62, 62, -28, 8, -1, 80, -10, 71, 62 }, - { -64, 8, 512, -64, 80, 8, 8, -1, 80, -10, 71, 62, -64, 8, 8, -1, -1, -10, -10, -1, 62, -28, -10, 62 }, - { 8, 80, -64, 512, 8, -64, -1, -10, -10, 62, 62, -28, 8, -64, -1, -10, 8, -1, -1, 8, 71, 62, 80, -10 }, - { 8, -64, 80, 8, 512, -64, -1, 8, 71, 62, 80, -10, -10, -1, 62, -28, -10, 62, -64, 8, 8, -1, -1, -10 }, - { 80, 8, 8, -64, -64, 512, -10, -1, 62, -28, -10, 62, -1, 8, 71, 62, 80, -10, 8, -64, -1, -10, 8, -1 }, - { -64, 8, 8, -1, -1, -10, 512, -64, -64, 8, 8, 80, 80, -10, 8, -1, 62, 71, -10, 62, -1, -10, -28, 62 }, - { 8, -64, -1, -10, 8, -1, -64, 512, 8, 80, -64, 8, -10, 62, -1, -10, -28, 62, 80, -10, 8, -1, 62, 71 }, - { 8, -1, 80, -10, 71, 62, -64, 8, 512, -64, 80, 8, 8, -1, -64, 8, -10, -1, 62, -28, -10, -1, 62, -10 }, - { -1, -10, -10, 62, 62, -28, 8, 80, -64, 512, 8, -64, -1, -10, 8, -64, -1, 8, 71, 62, -1, 8, -10, 80 }, - { -1, 8, 71, 62, 80, -10, 8, -64, 80, 8, 512, -64, 62, -28, -10, -1, 62, -10, 8, -1, -64, 8, -10, -1 }, - { -10, -1, 62, -28, -10, 62, 80, 8, 8, -64, -64, 512, 71, 62, -1, 8, -10, 80, -1, -10, 8, -64, -1, 8 }, - { 8, -1, -64, 8, -10, -1, 80, -10, 8, -1, 62, 71, 512, -64, -64, 8, 8, 80, 62, -10, -28, 62, -1, -10 }, - { -1, -10, 8, -64, -1, 8, -10, 62, -1, -10, -28, 62, -64, 512, 8, 80, -64, 8, -10, 80, 62, 71, 8, -1 }, - { 80, -10, 8, -1, 62, 71, 8, -1, -64, 8, -10, -1, -64, 8, 512, -64, 80, 8, -28, 62, 62, -10, -10, -1 }, - { -10, 62, -1, -10, -28, 62, -1, -10, 8, -64, -1, 8, 8, 80, -64, 512, 8, -64, 62, 71, -10, 80, -1, 8 }, - { 71, 62, -1, 8, -10, 80, 62, -28, -10, -1, 62, -10, 8, -64, 80, 8, 512, -64, -1, 8, -10, -1, -64, 8 }, - { 62, -28, -10, -1, 62, -10, 71, 62, -1, 8, -10, 80, 80, 8, 8, -64, -64, 512, -10, -1, -1, 8, 8, -64 }, - { -1, 8, -10, -1, -64, 8, -10, 80, 62, 71, 8, -1, 62, -10, -28, 62, -1, -10, 512, -64, -64, 8, 8, 80 }, - { -10, -1, -1, 8, 8, -64, 62, -10, -28, 62, -1, -10, -10, 80, 62, 71, 8, -1, -64, 512, 8, 80, -64, 8 }, - { -10, 80, 62, 71, 8, -1, -1, 8, -10, -1, -64, 8, -28, 62, 62, -10, -10, -1, -64, 8, 512, -64, 80, 8 }, - { 62, -10, -28, 62, -1, -10, -10, -1, -1, 8, 8, -64, 62, 71, -10, 80, -1, 8, 8, 80, -64, 512, 8, -64 }, - { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, - { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - 
value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -2775,7 +2698,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } 
//-------------------------------------------------------------------------- @@ -2810,6 +2737,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -2852,6 +2783,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -2972,8 +2907,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -2981,25 +2916,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr 
to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -3144,13 +3257,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 512 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -3162,18 +3269,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -3198,93 +3310,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -3326,7 +3375,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -3349,7 +3398,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -3358,21 +3407,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -3386,8 +3437,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -3403,11 +3456,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -3509,14 +3563,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h index 2b75e0f842..05c6aedfb3 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 123; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 24; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running 
sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/color_sum.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/color_sum.cc new file mode 100644 index 0000000000..91a7f9998e --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/color_sum.cc @@ -0,0 +1,449 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
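//--------------------------------------------------------------------------
// Illustration (not part of the generated code): rough host-side sizing (in fptype elements) of the
// per-helicity "ghel" super-buffers taken by the new GPU sigmaKin signature above, matching the
// gpuMemset calls in sigmaKin; the struct and helper names are hypothetical, and ghelAllBlasTmp is
// omitted because its size is implementation-dependent.
#include <cstddef>

struct GhelBufferSizes
{
  std::size_t mes;          // ghelAllMEs: one ME per good helicity per event
  std::size_t jamps;        // ghelAllJamps: 2 reals (one complex jamp) per color per good helicity per event
  std::size_t numerators;   // ghelAllNumerators (multichannel only): one per good helicity per event
  std::size_t denominators; // ghelAllDenominators (multichannel only): one per good helicity per event
  std::size_t jamp2s;       // colAllJamp2s: one per color per event (summed over helicities)
};

inline GhelBufferSizes ghelBufferSizes( std::size_t nGoodHel, std::size_t ncolor, std::size_t nevt )
{
  return GhelBufferSizes{ nGoodHel * nevt,
                          2 * ncolor * nGoodHel * nevt,
                          nGoodHel * nevt,
                          nGoodHel * nevt,
                          ncolor * nevt };
}
//--------------------------------------------------------------------------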
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=24) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54 }; // 1-D array[24] + + // The color matrix (initialize all array elements, with ncolor=24) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 512, -64, -64, 8, 8, 80, -64, 8, 8, -1, -1, -10, 8, -1, 80, -10, 71, 62, -1, -10, -10, 62, 62, -28 }, + { -64, 512, 8, 80, -64, 8, 8, -64, -1, -10, 8, -1, -1, -10, -10, 62, 62, -28, 8, -1, 80, -10, 71, 62 }, + { -64, 8, 512, -64, 80, 8, 8, -1, 80, -10, 71, 62, -64, 8, 8, -1, -1, -10, -10, -1, 62, -28, -10, 62 }, + { 8, 80, -64, 512, 8, -64, -1, -10, -10, 62, 62, -28, 8, -64, -1, -10, 8, -1, -1, 8, 71, 62, 80, -10 }, + { 8, -64, 80, 8, 512, -64, -1, 8, 71, 62, 80, -10, -10, -1, 62, -28, -10, 62, -64, 8, 8, -1, -1, -10 }, + { 80, 8, 8, -64, -64, 512, -10, -1, 62, -28, -10, 62, -1, 8, 71, 62, 80, -10, 8, -64, -1, -10, 8, -1 }, + { -64, 8, 8, -1, -1, -10, 512, -64, -64, 8, 8, 80, 80, -10, 8, -1, 62, 71, -10, 62, -1, -10, -28, 62 }, + { 8, -64, -1, -10, 8, -1, -64, 512, 8, 80, -64, 8, -10, 62, -1, -10, -28, 62, 80, -10, 8, -1, 62, 71 }, + { 8, -1, 80, -10, 71, 62, -64, 8, 512, -64, 80, 8, 8, -1, -64, 8, -10, -1, 62, -28, -10, -1, 62, -10 }, + { -1, -10, -10, 62, 62, -28, 8, 80, -64, 512, 8, -64, -1, -10, 8, -64, -1, 8, 71, 62, -1, 8, -10, 80 }, + { -1, 8, 71, 62, 80, -10, 8, -64, 80, 8, 512, -64, 62, -28, -10, -1, 62, -10, 8, -1, -64, 8, -10, -1 }, + { -10, -1, 62, -28, -10, 62, 80, 8, 8, -64, -64, 512, 71, 62, -1, 8, -10, 80, -1, -10, 8, -64, -1, 8 }, + { 8, -1, -64, 8, -10, -1, 80, -10, 8, -1, 62, 71, 512, -64, -64, 8, 8, 80, 62, -10, -28, 62, -1, -10 }, + { -1, -10, 8, -64, -1, 8, -10, 62, -1, -10, -28, 62, -64, 512, 8, 80, -64, 8, -10, 80, 62, 71, 8, -1 }, + { 80, -10, 8, -1, 62, 71, 8, -1, -64, 8, -10, -1, -64, 8, 512, -64, 80, 8, -28, 62, 62, -10, -10, -1 }, + { -10, 62, -1, -10, -28, 62, -1, -10, 8, -64, -1, 8, 8, 80, -64, 512, 8, -64, 62, 71, -10, 80, -1, 8 }, + { 71, 62, -1, 8, -10, 80, 62, -28, -10, -1, 62, -10, 8, -64, 80, 8, 512, -64, -1, 8, -10, -1, -64, 8 }, + { 62, -28, -10, -1, 62, -10, 71, 62, -1, 8, -10, 80, 80, 8, 8, -64, -64, 512, -10, -1, -1, 8, 8, -64 }, + { -1, 8, -10, -1, -64, 8, -10, 80, 62, 71, 8, -1, 62, -10, -28, 62, -1, -10, 512, -64, -64, 8, 8, 80 }, + { -10, -1, -1, 8, 8, -64, 62, -10, -28, 62, -1, -10, -10, 80, 62, 71, 8, -1, -64, 512, 8, 80, -64, 8 }, + { -10, 80, 62, 71, 8, -1, -1, 8, -10, -1, -64, 8, -28, 62, 62, -10, -10, -1, -64, 8, 512, -64, 80, 8 }, + { 62, -10, -28, 62, -1, -10, -10, -1, -1, 8, 8, -64, 62, 71, -10, 80, -1, 8, 8, 80, -64, 512, 8, -64 }, + { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, + { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + 
template<typename T>
+  struct NormalizedColorMatrix
+  {
+    constexpr __host__ __device__ NormalizedColorMatrix()
+      : value()
+    {
+      for( int icol = 0; icol < ncolor; icol++ )
+        for( int jcol = 0; jcol < ncolor; jcol++ )
+          value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol];
+    }
+    T value[ncolor * ncolor];
+  };
+  // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas)
+  static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor];
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void createNormalizedColorMatrix()
+  {
+    static bool first = true;
+    if( first )
+    {
+      first = false;
+      constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2;
+      gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) );
+    }
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifndef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_cpu( fptype* allMEs,              // output: allMEs[nevt], add |M|^2 for one specific helicity
+                 const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity
+                 const int ievt0 )            // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+  {
+    // Pre-compute a constexpr triangular color matrix properly normalized #475
+    struct TriangularNormalizedColorMatrix
+    {
+      // See https://stackoverflow.com/a/34465458
+      __host__ __device__ constexpr TriangularNormalizedColorMatrix()
+        : value()
+      {
+        for( int icol = 0; icol < ncolor; icol++ )
+        {
+          // Diagonal terms
+          value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol];
+          // Off-diagonal terms
+          for( int jcol = icol + 1; jcol < ncolor; jcol++ )
+            value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol];
+        }
+      }
+      fptype2 value[ncolor][ncolor];
+    };
+    static constexpr auto cf2 = TriangularNormalizedColorMatrix();
+    // Use the property that M is a real matrix (see #475):
+    // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB (the imaginary cross terms cancel since |M|^2 is real).
+    // In addition, on C++ use the property that M is symmetric (see #475),
+    // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time:
+    // we gain in speed here (though not a full factor 2) as we only loop over the upper-diagonal part of the matrix.
+    // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
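+    // Illustration of the triangular trick above for a symmetric 2x2 matrix M (same contribution as the full double loop):
+    //   full:       ME += A0*( M00*A0 + M01*A1 ) + A1*( M10*A0 + M11*A1 )
+    //   triangular: ME += A0*( M00*A0 + 2*M01*A1 ) + A1*( M11*A1 )   since M01 == M10
+    // (and similarly for the imaginary parts B, whose contribution BMB is added on top of AMA)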
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here
+    for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ )
+      for( int icol = 0; icol < ncolor; icol++ )
+        allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] =
+          allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt];
+  }
+#endif
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+#ifndef MGONGPU_HAS_NO_BLAS
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+  __global__ void
+  convertF2D_MEs( fptype* allMEs,             // output: allMEs[nevt] for one specific helicity
+                  const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity
+  {
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+    allMEs[ievt] = allMEsFpt2[ievt];
+  }
+#endif
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifndef MGONGPU_HAS_NO_BLAS
+  void
+  color_sum_blas( fptype* ghelAllMEs,           // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity
+                  const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities
+                  fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nhel good helicities
+                  gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+                  gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null)
+#else
+                  gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null)
+#endif
+                  const int nhel,               // input: number of good helicities (nhel == nGoodHel)
+                  const int gpublocks,          // input: cuda gpublocks
+                  const int gputhreads )        // input: cuda gputhreads
+  {
+    const int nevt = gpublocks * gputhreads;
+
+    // Get the address associated with the normalized color matrix in device memory
+    static fptype2* devNormColMat = nullptr;
+    if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 );
+
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffer for the nhel helicities
+    fptype2* ghelAllZtempBoth = ghelAllBlasTmp;                                         // start of first fptype2[ncolor*2*nhel*nevt] buffer
+    fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt;   // start of second fptype2[ncolor*2*nhel*nevt] buffer
+    fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer
+    // Convert jamps from double to float
+    for( int ighel = 0; ighel < nhel; ighel++ )
+    {
+      const fptype* hAllJamps = ghelAllJamps + ighel * nevt;    // jamps for a single helicity ihel
+      fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel
+      gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel );
+    }
+    // Real and imaginary components
+    const fptype2* ghelAllJampsReal = ghelAllJampsFpt2;
+    const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt;
+#else
+    // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer
+    static_assert( std::is_same<fptype, fptype2>::value );
+    fptype2* ghelAllZtempBoth = ghelAllBlasTmp; //
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/color_sum.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ 
b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/fbridge.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/makefile_original.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/color_sum.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + 
nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
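+# (For reference: PROCESS collects the common Fortran objects, while DSIG and DSIG_cudacpp collect driver.o or
+# driver_cudacpp.o plus the auto_dsig*.f objects except auto_dsig.f; the *_cudacpp.o objects are built with
+# -DMG5AMC_MEEXPORTER_CUDACPP via the %_cudacpp.o pattern rule further below.)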
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. 
+ +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h index 53dd560ed6..da11e740d9 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc index 47a3a011b8..a5e188e4f8 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17
// By the MadGraph5_aMC@NLO Development Team
// Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
//==========================================================================
diff --git a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h
index 76066c7bb1..24e0e80f84 100644
--- a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h
+++ b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h
@@ -7,7 +7,7 @@
// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
//==========================================================================
// This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17
// By the MadGraph5_aMC@NLO Development Team
// Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
//==========================================================================
diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h
index d3c4ca5695..7d34de72f8 100644
--- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h
@@ -74,6 +74,7 @@
#define MGONGPU_FPTYPE2_DOUBLE 1 // default
//#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster
#endif
+
// Choose whether to inline all HelAmps functions
// This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229)
// By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS
@@ -108,10 +109,23 @@
#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float)
#endif
+
+// Choose if cuBLAS and hipBLAS are supported for computing the color sum
+// For both CUDA and HIP, by default, assume that cuBLAS/hipBLAS are available, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS
+// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?)
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+//#undef MGONGPU_HAS_NO_BLAS // default
+////#define MGONGPU_HAS_NO_BLAS 1
+#elif defined __HIPCC__
+//#undef MGONGPU_HAS_NO_BLAS // default
+////#define MGONGPU_HAS_NO_BLAS 1
+#else
+#define MGONGPU_HAS_NO_BLAS 1
+#endif
+
// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
#undef MGONGPU_NSIGHT_DEBUG // default in CUDA
-//#define MGONGPU_NSIGHT_DEBUG 1
+//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED!
#else
#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++
#endif /* clang-format on */
@@ -232,19 +246,19 @@ using mgOnGpu::fptype2;
#endif /* clang-format off */
-// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!]
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index 1afa1ab2a5..bdc543f19d 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +57,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005990028381347656  +DEBUG: model prefixing takes 0.004824638366699219  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,27 +150,27 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.890 s +1 processes with 1240 diagrams generated in 1.735 s Total: 1 processes with 1240 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_gg_ttggg INFO: remove old information in CODEGEN_mad_gg_ttggg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] -INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction +INFO: Color-Flow passed to 1630 term in 6s. Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h @@ -179,25 +178,25 @@ FileWriter t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -DEBUG: len(subproc_diagrams_for_config) =  945 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 4, 4: 5, 5: 7, 6: 8, 7: 14, 8: 15, 9: 16, 10: 18, 11: 19, 12: 20, 13: 22, 14: 23, 15: 24, 16: 26, 17: 27, 18: 28, 19: 29, 20: 30, 21: 31, 22: 33, 23: 34, 24: 35, 25: 36, 26: 37, 27: 38, 28: 39, 29: 40, 30: 41, 31: 42, 32: 43, 33: 44, 34: 45, 35: 46, 36: 47, 37: 49, 38: 50, 39: 51, 40: 52, 41: 53, 42: 54, 43: 55, 44: 56, 45: 57, 46: 58, 47: 59, 48: 60, 49: 61, 50: 62, 51: 63, 52: 65, 53: 66, 54: 67, 55: 68, 56: 69, 57: 70, 58: 71, 59: 72, 60: 73, 61: 74, 62: 75, 63: 76, 64: 77, 65: 78, 66: 79, 67: 81, 68: 82, 69: 83, 70: 84, 71: 85, 72: 86, 73: 87, 74: 88, 75: 89, 76: 91, 77: 92, 78: 93, 79: 94, 80: 95, 81: 96, 82: 97, 83: 98, 84: 99, 85: 101, 86: 102, 87: 103, 88: 104, 89: 105, 90: 106, 91: 107, 92: 108, 93: 109, 94: 110, 95: 111, 96: 112, 97: 113, 98: 114, 99: 115, 100: 116, 101: 117, 102: 118, 103: 119, 104: 120, 105: 121, 106: 124, 107: 125, 108: 126, 109: 127, 110: 128, 111: 129, 112: 130, 113: 131, 114: 132, 115: 133, 116: 134, 117: 135, 118: 136, 119: 137, 120: 138, 121: 140, 122: 141, 123: 143, 124: 144, 125: 145, 126: 146, 127: 147, 128: 148, 129: 149, 130: 150, 131: 151, 132: 152, 133: 153, 134: 154, 135: 155, 136: 156, 137: 157, 138: 159, 139: 160, 140: 161, 141: 162, 142: 163, 143: 164, 144: 165, 145: 166, 146: 167, 147: 168, 148: 169, 149: 170, 150: 171, 151: 172, 152: 173, 153: 175, 154: 176, 155: 177, 156: 178, 157: 179, 158: 180, 159: 181, 160: 182, 161: 183, 162: 184, 163: 185, 164: 186, 165: 187, 166: 188, 167: 189, 168: 190, 169: 191, 170: 192, 171: 193, 172: 194, 173: 195, 174: 196, 175: 197, 176: 198, 177: 199, 178: 200, 179: 201, 180: 202, 181: 203, 182: 204, 183: 205, 184: 206, 185: 207, 186: 208, 187: 209, 188: 210, 189: 211, 190: 212, 191: 213, 192: 214, 193: 215, 194: 216, 195: 217, 196: 218, 197: 220, 198: 221, 199: 222, 200: 223, 201: 224, 202: 225, 203: 227, 204: 228, 205: 229, 206: 230, 207: 231, 208: 232, 209: 234, 210: 235, 211: 247, 212: 248, 213: 249, 214: 250, 215: 251, 216: 252, 217: 253, 218: 254, 219: 255, 220: 256, 221: 257, 222: 258, 223: 259, 224: 260, 225: 261, 226: 263, 227: 264, 228: 266, 229: 267, 230: 268, 231: 269, 232: 270, 233: 271, 234: 272, 235: 273, 236: 274, 237: 275, 238: 276, 239: 277, 240: 278, 241: 279, 242: 280, 243: 282, 244: 283, 245: 284, 246: 285, 247: 286, 248: 287, 249: 288, 250: 289, 251: 290, 252: 291, 253: 292, 254: 293, 255: 294, 256: 295, 257: 296, 258: 298, 259: 299, 260: 300, 261: 301, 262: 302, 263: 303, 264: 304, 265: 305, 266: 306, 267: 307, 268: 308, 269: 309, 270: 310, 271: 311, 272: 312, 273: 313, 274: 314, 275: 315, 276: 316, 277: 317, 278: 318, 279: 319, 280: 320, 281: 321, 282: 322, 283: 323, 284: 324, 285: 325, 286: 326, 287: 327, 288: 328, 289: 329, 290: 330, 291: 331, 292: 332, 293: 333, 294: 334, 295: 335, 296: 336, 297: 337, 298: 338, 299: 339, 300: 340, 301: 341, 302: 343, 303: 344, 304: 345, 305: 346, 306: 347, 307: 348, 308: 350, 309: 351, 310: 352, 311: 353, 312: 354, 313: 355, 314: 357, 315: 358, 316: 370, 317: 371, 318: 372, 319: 373, 320: 374, 321: 375, 322: 377, 323: 378, 324: 379, 325: 380, 326: 381, 327: 382, 328: 383, 329: 384, 330: 385, 331: 386, 332: 387, 333: 388, 334: 389, 335: 390, 336: 391, 337: 393, 338: 394, 339: 395, 340: 396, 341: 397, 342: 398, 343: 399, 344: 400, 345: 401, 346: 402, 347: 403, 348: 404, 349: 
405, 350: 406, 351: 407, 352: 409, 353: 410, 354: 411, 355: 412, 356: 413, 357: 414, 358: 415, 359: 416, 360: 417, 361: 418, 362: 419, 363: 420, 364: 421, 365: 422, 366: 423, 367: 425, 368: 426, 369: 427, 370: 428, 371: 429, 372: 430, 373: 431, 374: 432, 375: 433, 376: 434, 377: 435, 378: 437, 379: 438, 380: 440, 381: 441, 382: 447, 383: 448, 384: 449, 385: 450, 386: 451, 387: 452, 388: 453, 389: 454, 390: 455, 391: 457, 392: 458, 393: 459, 394: 460, 395: 461, 396: 462, 397: 463, 398: 464, 399: 465, 400: 467, 401: 468, 402: 469, 403: 470, 404: 471, 405: 472, 406: 473, 407: 474, 408: 475, 409: 477, 410: 478, 411: 479, 412: 480, 413: 481, 414: 482, 415: 484, 416: 485, 417: 486, 418: 487, 419: 488, 420: 489, 421: 493, 422: 494, 423: 495, 424: 496, 425: 497, 426: 498, 427: 500, 428: 501, 429: 502, 430: 503, 431: 504, 432: 505, 433: 506, 434: 507, 435: 508, 436: 509, 437: 510, 438: 511, 439: 512, 440: 513, 441: 514, 442: 516, 443: 517, 444: 518, 445: 519, 446: 520, 447: 521, 448: 522, 449: 523, 450: 524, 451: 525, 452: 526, 453: 527, 454: 528, 455: 529, 456: 530, 457: 532, 458: 533, 459: 534, 460: 535, 461: 536, 462: 537, 463: 538, 464: 539, 465: 540, 466: 541, 467: 542, 468: 543, 469: 544, 470: 545, 471: 546, 472: 548, 473: 549, 474: 550, 475: 551, 476: 552, 477: 553, 478: 554, 479: 555, 480: 556, 481: 557, 482: 558, 483: 560, 484: 561, 485: 563, 486: 564, 487: 570, 488: 571, 489: 572, 490: 573, 491: 574, 492: 575, 493: 576, 494: 577, 495: 578, 496: 580, 497: 581, 498: 582, 499: 583, 500: 584, 501: 585, 502: 586, 503: 587, 504: 588, 505: 590, 506: 591, 507: 592, 508: 593, 509: 594, 510: 595, 511: 596, 512: 597, 513: 598, 514: 600, 515: 601, 516: 602, 517: 603, 518: 604, 519: 605, 520: 607, 521: 608, 522: 609, 523: 610, 524: 611, 525: 612, 526: 616, 527: 617, 528: 618, 529: 619, 530: 620, 531: 621, 532: 623, 533: 624, 534: 625, 535: 626, 536: 627, 537: 628, 538: 629, 539: 630, 540: 631, 541: 632, 542: 633, 543: 634, 544: 635, 545: 636, 546: 637, 547: 639, 548: 640, 549: 641, 550: 642, 551: 643, 552: 644, 553: 645, 554: 646, 555: 647, 556: 648, 557: 649, 558: 650, 559: 651, 560: 652, 561: 653, 562: 655, 563: 656, 564: 657, 565: 658, 566: 659, 567: 660, 568: 661, 569: 662, 570: 663, 571: 664, 572: 665, 573: 666, 574: 667, 575: 668, 576: 669, 577: 671, 578: 672, 579: 673, 580: 674, 581: 675, 582: 676, 583: 677, 584: 678, 585: 679, 586: 680, 587: 681, 588: 683, 589: 684, 590: 686, 591: 687, 592: 693, 593: 694, 594: 695, 595: 696, 596: 697, 597: 698, 598: 699, 599: 700, 600: 701, 601: 703, 602: 704, 603: 705, 604: 706, 605: 707, 606: 708, 607: 709, 608: 710, 609: 711, 610: 713, 611: 714, 612: 715, 613: 716, 614: 717, 615: 718, 616: 719, 617: 720, 618: 721, 619: 723, 620: 724, 621: 725, 622: 726, 623: 727, 624: 728, 625: 730, 626: 731, 627: 732, 628: 733, 629: 734, 630: 735, 631: 739, 632: 740, 633: 741, 634: 742, 635: 743, 636: 744, 637: 745, 638: 746, 639: 747, 640: 748, 641: 749, 642: 750, 643: 751, 644: 752, 645: 753, 646: 754, 647: 755, 648: 756, 649: 757, 650: 758, 651: 759, 652: 760, 653: 761, 654: 762, 655: 763, 656: 764, 657: 765, 658: 766, 659: 767, 660: 768, 661: 769, 662: 770, 663: 771, 664: 773, 665: 774, 666: 775, 667: 776, 668: 777, 669: 778, 670: 780, 671: 781, 672: 782, 673: 783, 674: 784, 675: 785, 676: 789, 677: 790, 678: 791, 679: 792, 680: 793, 681: 794, 682: 795, 683: 796, 684: 797, 685: 798, 686: 799, 687: 800, 688: 801, 689: 802, 690: 803, 691: 804, 692: 805, 693: 806, 694: 807, 695: 808, 696: 809, 697: 810, 698: 811, 699: 812, 700: 813, 701: 814, 702: 815, 703: 816, 704: 817, 
705: 818, 706: 819, 707: 820, 708: 821, 709: 823, 710: 824, 711: 825, 712: 826, 713: 827, 714: 828, 715: 830, 716: 831, 717: 832, 718: 833, 719: 834, 720: 835, 721: 839, 722: 840, 723: 842, 724: 843, 725: 845, 726: 846, 727: 852, 728: 853, 729: 854, 730: 855, 731: 856, 732: 857, 733: 858, 734: 859, 735: 860, 736: 862, 737: 863, 738: 864, 739: 865, 740: 866, 741: 867, 742: 868, 743: 869, 744: 870, 745: 872, 746: 873, 747: 874, 748: 875, 749: 876, 750: 877, 751: 878, 752: 879, 753: 880, 754: 882, 755: 883, 756: 884, 757: 885, 758: 886, 759: 887, 760: 889, 761: 890, 762: 891, 763: 892, 764: 893, 765: 894, 766: 895, 767: 896, 768: 898, 769: 899, 770: 901, 771: 902, 772: 908, 773: 909, 774: 910, 775: 911, 776: 912, 777: 913, 778: 914, 779: 915, 780: 916, 781: 918, 782: 919, 783: 920, 784: 921, 785: 922, 786: 923, 787: 924, 788: 925, 789: 926, 790: 928, 791: 929, 792: 930, 793: 931, 794: 932, 795: 933, 796: 934, 797: 935, 798: 936, 799: 938, 800: 939, 801: 940, 802: 941, 803: 942, 804: 943, 805: 945, 806: 946, 807: 947, 808: 948, 809: 949, 810: 950, 811: 951, 812: 952, 813: 954, 814: 955, 815: 957, 816: 958, 817: 964, 818: 965, 819: 966, 820: 967, 821: 968, 822: 969, 823: 970, 824: 971, 825: 972, 826: 974, 827: 975, 828: 976, 829: 977, 830: 978, 831: 979, 832: 980, 833: 981, 834: 982, 835: 984, 836: 985, 837: 986, 838: 987, 839: 988, 840: 989, 841: 990, 842: 991, 843: 992, 844: 994, 845: 995, 846: 996, 847: 997, 848: 998, 849: 999, 850: 1001, 851: 1002, 852: 1003, 853: 1004, 854: 1005, 855: 1006, 856: 1007, 857: 1008, 858: 1010, 859: 1011, 860: 1013, 861: 1014, 862: 1019, 863: 1020, 864: 1022, 865: 1023, 866: 1025, 867: 1026, 868: 1031, 869: 1032, 870: 1034, 871: 1035, 872: 1037, 873: 1038, 874: 1046, 875: 1047, 876: 1048, 877: 1049, 878: 1050, 879: 1051, 880: 1052, 881: 1053, 882: 1054, 883: 1055, 884: 1056, 885: 1057, 886: 1058, 887: 1059, 888: 1060, 889: 1061, 890: 1062, 891: 1063, 892: 1065, 893: 1066, 894: 1067, 895: 1068, 896: 1069, 897: 1070, 898: 1071, 899: 1072, 900: 1073, 901: 1074, 902: 1075, 903: 1076, 904: 1077, 905: 1078, 906: 1079, 907: 1080, 908: 1081, 909: 1082, 910: 1084, 911: 1085, 912: 1086, 913: 1087, 914: 1088, 915: 1089, 916: 1090, 917: 1091, 918: 1092, 919: 1093, 920: 1094, 921: 1095, 922: 1096, 923: 1097, 924: 1098, 925: 1099, 926: 1100, 927: 1101, 928: 1103, 929: 1104, 930: 1105, 931: 1106, 932: 1107, 933: 1108, 934: 1110, 935: 1111, 936: 1112, 937: 1113, 938: 1114, 939: 1115, 940: 1117, 941: 1118, 942: 1119, 943: 1120, 944: 1121, 945: 1122} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 4: 3, 5: 4, 7: 5, 8: 6, 14: 7, 15: 8, 16: 9, 18: 10, 19: 11, 20: 12, 22: 13, 23: 14, 24: 15, 26: 16, 27: 17, 28: 18, 29: 19, 30: 20, 31: 21, 33: 22, 34: 23, 35: 24, 36: 25, 37: 26, 38: 27, 39: 28, 40: 29, 41: 30, 42: 31, 43: 32, 44: 33, 45: 34, 46: 35, 47: 36, 49: 37, 50: 38, 51: 39, 52: 40, 53: 41, 54: 42, 55: 43, 56: 44, 57: 45, 58: 46, 59: 47, 60: 48, 61: 49, 62: 50, 63: 51, 65: 52, 66: 53, 67: 54, 68: 55, 69: 56, 70: 57, 71: 58, 72: 59, 73: 60, 74: 61, 75: 62, 76: 63, 77: 64, 78: 65, 79: 66, 81: 67, 82: 68, 83: 69, 84: 70, 85: 71, 86: 72, 87: 73, 88: 74, 89: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 101: 85, 102: 86, 103: 87, 104: 88, 105: 89, 106: 90, 107: 91, 108: 92, 109: 93, 110: 94, 111: 95, 112: 96, 113: 97, 114: 98, 115: 99, 116: 100, 117: 101, 118: 102, 119: 103, 120: 104, 121: 105, 124: 106, 125: 107, 126: 108, 127: 109, 128: 110, 129: 111, 130: 112, 131: 113, 132: 114, 133: 115, 134: 116, 135: 117, 136: 118, 137: 
119, 138: 120, 140: 121, 141: 122, 143: 123, 144: 124, 145: 125, 146: 126, 147: 127, 148: 128, 149: 129, 150: 130, 151: 131, 152: 132, 153: 133, 154: 134, 155: 135, 156: 136, 157: 137, 159: 138, 160: 139, 161: 140, 162: 141, 163: 142, 164: 143, 165: 144, 166: 145, 167: 146, 168: 147, 169: 148, 170: 149, 171: 150, 172: 151, 173: 152, 175: 153, 176: 154, 177: 155, 178: 156, 179: 157, 180: 158, 181: 159, 182: 160, 183: 161, 184: 162, 185: 163, 186: 164, 187: 165, 188: 166, 189: 167, 190: 168, 191: 169, 192: 170, 193: 171, 194: 172, 195: 173, 196: 174, 197: 175, 198: 176, 199: 177, 200: 178, 201: 179, 202: 180, 203: 181, 204: 182, 205: 183, 206: 184, 207: 185, 208: 186, 209: 187, 210: 188, 211: 189, 212: 190, 213: 191, 214: 192, 215: 193, 216: 194, 217: 195, 218: 196, 220: 197, 221: 198, 222: 199, 223: 200, 224: 201, 225: 202, 227: 203, 228: 204, 229: 205, 230: 206, 231: 207, 232: 208, 234: 209, 235: 210, 247: 211, 248: 212, 249: 213, 250: 214, 251: 215, 252: 216, 253: 217, 254: 218, 255: 219, 256: 220, 257: 221, 258: 222, 259: 223, 260: 224, 261: 225, 263: 226, 264: 227, 266: 228, 267: 229, 268: 230, 269: 231, 270: 232, 271: 233, 272: 234, 273: 235, 274: 236, 275: 237, 276: 238, 277: 239, 278: 240, 279: 241, 280: 242, 282: 243, 283: 244, 284: 245, 285: 246, 286: 247, 287: 248, 288: 249, 289: 250, 290: 251, 291: 252, 292: 253, 293: 254, 294: 255, 295: 256, 296: 257, 298: 258, 299: 259, 300: 260, 301: 261, 302: 262, 303: 263, 304: 264, 305: 265, 306: 266, 307: 267, 308: 268, 309: 269, 310: 270, 311: 271, 312: 272, 313: 273, 314: 274, 315: 275, 316: 276, 317: 277, 318: 278, 319: 279, 320: 280, 321: 281, 322: 282, 323: 283, 324: 284, 325: 285, 326: 286, 327: 287, 328: 288, 329: 289, 330: 290, 331: 291, 332: 292, 333: 293, 334: 294, 335: 295, 336: 296, 337: 297, 338: 298, 339: 299, 340: 300, 341: 301, 343: 302, 344: 303, 345: 304, 346: 305, 347: 306, 348: 307, 350: 308, 351: 309, 352: 310, 353: 311, 354: 312, 355: 313, 357: 314, 358: 315, 370: 316, 371: 317, 372: 318, 373: 319, 374: 320, 375: 321, 377: 322, 378: 323, 379: 324, 380: 325, 381: 326, 382: 327, 383: 328, 384: 329, 385: 330, 386: 331, 387: 332, 388: 333, 389: 334, 390: 335, 391: 336, 393: 337, 394: 338, 395: 339, 396: 340, 397: 341, 398: 342, 399: 343, 400: 344, 401: 345, 402: 346, 403: 347, 404: 348, 405: 349, 406: 350, 407: 351, 409: 352, 410: 353, 411: 354, 412: 355, 413: 356, 414: 357, 415: 358, 416: 359, 417: 360, 418: 361, 419: 362, 420: 363, 421: 364, 422: 365, 423: 366, 425: 367, 426: 368, 427: 369, 428: 370, 429: 371, 430: 372, 431: 373, 432: 374, 433: 375, 434: 376, 435: 377, 437: 378, 438: 379, 440: 380, 441: 381, 447: 382, 448: 383, 449: 384, 450: 385, 451: 386, 452: 387, 453: 388, 454: 389, 455: 390, 457: 391, 458: 392, 459: 393, 460: 394, 461: 395, 462: 396, 463: 397, 464: 398, 465: 399, 467: 400, 468: 401, 469: 402, 470: 403, 471: 404, 472: 405, 473: 406, 474: 407, 475: 408, 477: 409, 478: 410, 479: 411, 480: 412, 481: 413, 482: 414, 484: 415, 485: 416, 486: 417, 487: 418, 488: 419, 489: 420, 493: 421, 494: 422, 495: 423, 496: 424, 497: 425, 498: 426, 500: 427, 501: 428, 502: 429, 503: 430, 504: 431, 505: 432, 506: 433, 507: 434, 508: 435, 509: 436, 510: 437, 511: 438, 512: 439, 513: 440, 514: 441, 516: 442, 517: 443, 518: 444, 519: 445, 520: 446, 521: 447, 522: 448, 523: 449, 524: 450, 525: 451, 526: 452, 527: 453, 528: 454, 529: 455, 530: 456, 532: 457, 533: 458, 534: 459, 535: 460, 536: 461, 537: 462, 538: 463, 539: 464, 540: 465, 541: 466, 542: 467, 543: 468, 544: 469, 545: 470, 546: 471, 548: 472, 549: 473, 550: 474, 
551: 475, 552: 476, 553: 477, 554: 478, 555: 479, 556: 480, 557: 481, 558: 482, 560: 483, 561: 484, 563: 485, 564: 486, 570: 487, 571: 488, 572: 489, 573: 490, 574: 491, 575: 492, 576: 493, 577: 494, 578: 495, 580: 496, 581: 497, 582: 498, 583: 499, 584: 500, 585: 501, 586: 502, 587: 503, 588: 504, 590: 505, 591: 506, 592: 507, 593: 508, 594: 509, 595: 510, 596: 511, 597: 512, 598: 513, 600: 514, 601: 515, 602: 516, 603: 517, 604: 518, 605: 519, 607: 520, 608: 521, 609: 522, 610: 523, 611: 524, 612: 525, 616: 526, 617: 527, 618: 528, 619: 529, 620: 530, 621: 531, 623: 532, 624: 533, 625: 534, 626: 535, 627: 536, 628: 537, 629: 538, 630: 539, 631: 540, 632: 541, 633: 542, 634: 543, 635: 544, 636: 545, 637: 546, 639: 547, 640: 548, 641: 549, 642: 550, 643: 551, 644: 552, 645: 553, 646: 554, 647: 555, 648: 556, 649: 557, 650: 558, 651: 559, 652: 560, 653: 561, 655: 562, 656: 563, 657: 564, 658: 565, 659: 566, 660: 567, 661: 568, 662: 569, 663: 570, 664: 571, 665: 572, 666: 573, 667: 574, 668: 575, 669: 576, 671: 577, 672: 578, 673: 579, 674: 580, 675: 581, 676: 582, 677: 583, 678: 584, 679: 585, 680: 586, 681: 587, 683: 588, 684: 589, 686: 590, 687: 591, 693: 592, 694: 593, 695: 594, 696: 595, 697: 596, 698: 597, 699: 598, 700: 599, 701: 600, 703: 601, 704: 602, 705: 603, 706: 604, 707: 605, 708: 606, 709: 607, 710: 608, 711: 609, 713: 610, 714: 611, 715: 612, 716: 613, 717: 614, 718: 615, 719: 616, 720: 617, 721: 618, 723: 619, 724: 620, 725: 621, 726: 622, 727: 623, 728: 624, 730: 625, 731: 626, 732: 627, 733: 628, 734: 629, 735: 630, 739: 631, 740: 632, 741: 633, 742: 634, 743: 635, 744: 636, 745: 637, 746: 638, 747: 639, 748: 640, 749: 641, 750: 642, 751: 643, 752: 644, 753: 645, 754: 646, 755: 647, 756: 648, 757: 649, 758: 650, 759: 651, 760: 652, 761: 653, 762: 654, 763: 655, 764: 656, 765: 657, 766: 658, 767: 659, 768: 660, 769: 661, 770: 662, 771: 663, 773: 664, 774: 665, 775: 666, 776: 667, 777: 668, 778: 669, 780: 670, 781: 671, 782: 672, 783: 673, 784: 674, 785: 675, 789: 676, 790: 677, 791: 678, 792: 679, 793: 680, 794: 681, 795: 682, 796: 683, 797: 684, 798: 685, 799: 686, 800: 687, 801: 688, 802: 689, 803: 690, 804: 691, 805: 692, 806: 693, 807: 694, 808: 695, 809: 696, 810: 697, 811: 698, 812: 699, 813: 700, 814: 701, 815: 702, 816: 703, 817: 704, 818: 705, 819: 706, 820: 707, 821: 708, 823: 709, 824: 710, 825: 711, 826: 712, 827: 713, 828: 714, 830: 715, 831: 716, 832: 717, 833: 718, 834: 719, 835: 720, 839: 721, 840: 722, 842: 723, 843: 724, 845: 725, 846: 726, 852: 727, 853: 728, 854: 729, 855: 730, 856: 731, 857: 732, 858: 733, 859: 734, 860: 735, 862: 736, 863: 737, 864: 738, 865: 739, 866: 740, 867: 741, 868: 742, 869: 743, 870: 744, 872: 745, 873: 746, 874: 747, 875: 748, 876: 749, 877: 750, 878: 751, 879: 752, 880: 753, 882: 754, 883: 755, 884: 756, 885: 757, 886: 758, 887: 759, 889: 760, 890: 761, 891: 762, 892: 763, 893: 764, 894: 765, 895: 766, 896: 767, 898: 768, 899: 769, 901: 770, 902: 771, 908: 772, 909: 773, 910: 774, 911: 775, 912: 776, 913: 777, 914: 778, 915: 779, 916: 780, 918: 781, 919: 782, 920: 783, 921: 784, 922: 785, 923: 786, 924: 787, 925: 788, 926: 789, 928: 790, 929: 791, 930: 792, 931: 793, 932: 794, 933: 795, 934: 796, 935: 797, 936: 798, 938: 799, 939: 800, 940: 801, 941: 802, 942: 803, 943: 804, 945: 805, 946: 806, 947: 807, 948: 808, 949: 809, 950: 810, 951: 811, 952: 812, 954: 813, 955: 814, 957: 815, 958: 816, 964: 817, 965: 818, 966: 819, 967: 820, 968: 821, 969: 822, 970: 823, 971: 824, 972: 825, 974: 826, 975: 827, 976: 828, 977: 829, 978: 
830, 979: 831, 980: 832, 981: 833, 982: 834, 984: 835, 985: 836, 986: 837, 987: 838, 988: 839, 989: 840, 990: 841, 991: 842, 992: 843, 994: 844, 995: 845, 996: 846, 997: 847, 998: 848, 999: 849, 1001: 850, 1002: 851, 1003: 852, 1004: 853, 1005: 854, 1006: 855, 1007: 856, 1008: 857, 1010: 858, 1011: 859, 1013: 860, 1014: 861, 1019: 862, 1020: 863, 1022: 864, 1023: 865, 1025: 866, 1026: 867, 1031: 868, 1032: 869, 1034: 870, 1035: 871, 1037: 872, 1038: 873, 1046: 874, 1047: 875, 1048: 876, 1049: 877, 1050: 878, 1051: 879, 1052: 880, 1053: 881, 1054: 882, 1055: 883, 1056: 884, 1057: 885, 1058: 886, 1059: 887, 1060: 888, 1061: 889, 1062: 890, 1063: 891, 1065: 892, 1066: 893, 1067: 894, 1068: 895, 1069: 896, 1070: 897, 1071: 898, 1072: 899, 1073: 900, 1074: 901, 1075: 902, 1076: 903, 1077: 904, 1078: 905, 1079: 906, 1080: 907, 1081: 908, 1082: 909, 1084: 910, 1085: 911, 1086: 912, 1087: 913, 1088: 914, 1089: 915, 1090: 916, 1091: 917, 1092: 918, 1093: 919, 1094: 920, 1095: 921, 1096: 922, 1097: 923, 1098: 924, 1099: 925, 1100: 926, 1101: 927, 1103: 928, 1104: 929, 1105: 930, 1106: 931, 1107: 932, 1108: 933, 1110: 934, 1111: 935, 1112: 936, 1113: 937, 1114: 938, 1115: 939, 1117: 940, 1118: 941, 1119: 942, 1120: 943, 1121: 944, 1122: 945} [model_handling.py at line 1552]  -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.565 s -Wrote files for 2281 helas calls in 18.614 s +DEBUG: len(subproc_diagrams_for_config) =  945 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 4, 4: 5, 5: 7, 6: 8, 7: 14, 8: 15, 9: 16, 10: 18, 11: 19, 12: 20, 13: 22, 14: 23, 15: 24, 16: 26, 17: 27, 18: 28, 19: 29, 20: 30, 21: 31, 22: 33, 23: 34, 24: 35, 25: 36, 26: 37, 27: 38, 28: 39, 29: 40, 30: 41, 31: 42, 32: 43, 33: 44, 34: 45, 35: 46, 36: 47, 37: 49, 38: 50, 39: 51, 40: 52, 41: 53, 42: 54, 43: 55, 44: 56, 45: 57, 46: 58, 47: 59, 48: 60, 49: 61, 50: 62, 51: 63, 52: 65, 53: 66, 54: 67, 55: 68, 56: 69, 57: 70, 58: 71, 59: 72, 60: 73, 61: 74, 62: 75, 63: 76, 64: 77, 65: 78, 66: 79, 67: 81, 68: 82, 69: 83, 70: 84, 71: 85, 72: 86, 73: 87, 74: 88, 75: 89, 76: 91, 77: 92, 78: 93, 79: 94, 80: 95, 81: 96, 82: 97, 83: 98, 84: 99, 85: 101, 86: 102, 87: 103, 88: 104, 89: 105, 90: 106, 91: 107, 92: 108, 93: 109, 94: 110, 95: 111, 96: 112, 97: 113, 98: 114, 99: 115, 100: 116, 101: 117, 102: 118, 103: 119, 104: 120, 105: 121, 106: 124, 107: 125, 108: 126, 109: 127, 110: 128, 111: 129, 112: 130, 113: 131, 114: 132, 115: 133, 116: 134, 117: 135, 118: 136, 119: 137, 120: 138, 121: 140, 122: 141, 123: 143, 124: 144, 125: 145, 126: 146, 127: 147, 128: 148, 129: 149, 130: 150, 131: 151, 132: 152, 133: 153, 134: 154, 135: 155, 136: 156, 137: 157, 138: 159, 139: 160, 140: 161, 141: 162, 142: 163, 143: 164, 144: 165, 145: 166, 146: 167, 147: 168, 148: 169, 149: 170, 150: 171, 151: 172, 152: 173, 153: 175, 154: 176, 155: 177, 156: 178, 157: 179, 158: 180, 159: 181, 160: 182, 161: 183, 162: 184, 163: 185, 164: 186, 165: 187, 166: 188, 167: 189, 168: 190, 169: 191, 170: 192, 171: 193, 172: 194, 173: 195, 174: 196, 175: 197, 176: 198, 177: 199, 178: 200, 179: 201, 180: 202, 181: 203, 182: 204, 183: 205, 184: 206, 185: 207, 186: 208, 187: 209, 188: 210, 189: 211, 190: 212, 191: 213, 192: 214, 193: 215, 194: 216, 195: 217, 196: 218, 197: 220, 198: 221, 199: 222, 200: 223, 201: 224, 202: 225, 203: 227, 204: 228, 205: 229, 206: 230, 207: 231, 208: 232, 209: 234, 210: 235, 211: 247, 212: 248, 213: 249, 214: 250, 215: 251, 216: 252, 217: 253, 218: 254, 219: 255, 220: 256, 221: 257, 222: 258, 223: 259, 224: 260, 
225: 261, 226: 263, 227: 264, 228: 266, 229: 267, 230: 268, 231: 269, 232: 270, 233: 271, 234: 272, 235: 273, 236: 274, 237: 275, 238: 276, 239: 277, 240: 278, 241: 279, 242: 280, 243: 282, 244: 283, 245: 284, 246: 285, 247: 286, 248: 287, 249: 288, 250: 289, 251: 290, 252: 291, 253: 292, 254: 293, 255: 294, 256: 295, 257: 296, 258: 298, 259: 299, 260: 300, 261: 301, 262: 302, 263: 303, 264: 304, 265: 305, 266: 306, 267: 307, 268: 308, 269: 309, 270: 310, 271: 311, 272: 312, 273: 313, 274: 314, 275: 315, 276: 316, 277: 317, 278: 318, 279: 319, 280: 320, 281: 321, 282: 322, 283: 323, 284: 324, 285: 325, 286: 326, 287: 327, 288: 328, 289: 329, 290: 330, 291: 331, 292: 332, 293: 333, 294: 334, 295: 335, 296: 336, 297: 337, 298: 338, 299: 339, 300: 340, 301: 341, 302: 343, 303: 344, 304: 345, 305: 346, 306: 347, 307: 348, 308: 350, 309: 351, 310: 352, 311: 353, 312: 354, 313: 355, 314: 357, 315: 358, 316: 370, 317: 371, 318: 372, 319: 373, 320: 374, 321: 375, 322: 377, 323: 378, 324: 379, 325: 380, 326: 381, 327: 382, 328: 383, 329: 384, 330: 385, 331: 386, 332: 387, 333: 388, 334: 389, 335: 390, 336: 391, 337: 393, 338: 394, 339: 395, 340: 396, 341: 397, 342: 398, 343: 399, 344: 400, 345: 401, 346: 402, 347: 403, 348: 404, 349: 405, 350: 406, 351: 407, 352: 409, 353: 410, 354: 411, 355: 412, 356: 413, 357: 414, 358: 415, 359: 416, 360: 417, 361: 418, 362: 419, 363: 420, 364: 421, 365: 422, 366: 423, 367: 425, 368: 426, 369: 427, 370: 428, 371: 429, 372: 430, 373: 431, 374: 432, 375: 433, 376: 434, 377: 435, 378: 437, 379: 438, 380: 440, 381: 441, 382: 447, 383: 448, 384: 449, 385: 450, 386: 451, 387: 452, 388: 453, 389: 454, 390: 455, 391: 457, 392: 458, 393: 459, 394: 460, 395: 461, 396: 462, 397: 463, 398: 464, 399: 465, 400: 467, 401: 468, 402: 469, 403: 470, 404: 471, 405: 472, 406: 473, 407: 474, 408: 475, 409: 477, 410: 478, 411: 479, 412: 480, 413: 481, 414: 482, 415: 484, 416: 485, 417: 486, 418: 487, 419: 488, 420: 489, 421: 493, 422: 494, 423: 495, 424: 496, 425: 497, 426: 498, 427: 500, 428: 501, 429: 502, 430: 503, 431: 504, 432: 505, 433: 506, 434: 507, 435: 508, 436: 509, 437: 510, 438: 511, 439: 512, 440: 513, 441: 514, 442: 516, 443: 517, 444: 518, 445: 519, 446: 520, 447: 521, 448: 522, 449: 523, 450: 524, 451: 525, 452: 526, 453: 527, 454: 528, 455: 529, 456: 530, 457: 532, 458: 533, 459: 534, 460: 535, 461: 536, 462: 537, 463: 538, 464: 539, 465: 540, 466: 541, 467: 542, 468: 543, 469: 544, 470: 545, 471: 546, 472: 548, 473: 549, 474: 550, 475: 551, 476: 552, 477: 553, 478: 554, 479: 555, 480: 556, 481: 557, 482: 558, 483: 560, 484: 561, 485: 563, 486: 564, 487: 570, 488: 571, 489: 572, 490: 573, 491: 574, 492: 575, 493: 576, 494: 577, 495: 578, 496: 580, 497: 581, 498: 582, 499: 583, 500: 584, 501: 585, 502: 586, 503: 587, 504: 588, 505: 590, 506: 591, 507: 592, 508: 593, 509: 594, 510: 595, 511: 596, 512: 597, 513: 598, 514: 600, 515: 601, 516: 602, 517: 603, 518: 604, 519: 605, 520: 607, 521: 608, 522: 609, 523: 610, 524: 611, 525: 612, 526: 616, 527: 617, 528: 618, 529: 619, 530: 620, 531: 621, 532: 623, 533: 624, 534: 625, 535: 626, 536: 627, 537: 628, 538: 629, 539: 630, 540: 631, 541: 632, 542: 633, 543: 634, 544: 635, 545: 636, 546: 637, 547: 639, 548: 640, 549: 641, 550: 642, 551: 643, 552: 644, 553: 645, 554: 646, 555: 647, 556: 648, 557: 649, 558: 650, 559: 651, 560: 652, 561: 653, 562: 655, 563: 656, 564: 657, 565: 658, 566: 659, 567: 660, 568: 661, 569: 662, 570: 663, 571: 664, 572: 665, 573: 666, 574: 667, 575: 668, 576: 669, 577: 671, 578: 672, 579: 673, 580: 
674, 581: 675, 582: 676, 583: 677, 584: 678, 585: 679, 586: 680, 587: 681, 588: 683, 589: 684, 590: 686, 591: 687, 592: 693, 593: 694, 594: 695, 595: 696, 596: 697, 597: 698, 598: 699, 599: 700, 600: 701, 601: 703, 602: 704, 603: 705, 604: 706, 605: 707, 606: 708, 607: 709, 608: 710, 609: 711, 610: 713, 611: 714, 612: 715, 613: 716, 614: 717, 615: 718, 616: 719, 617: 720, 618: 721, 619: 723, 620: 724, 621: 725, 622: 726, 623: 727, 624: 728, 625: 730, 626: 731, 627: 732, 628: 733, 629: 734, 630: 735, 631: 739, 632: 740, 633: 741, 634: 742, 635: 743, 636: 744, 637: 745, 638: 746, 639: 747, 640: 748, 641: 749, 642: 750, 643: 751, 644: 752, 645: 753, 646: 754, 647: 755, 648: 756, 649: 757, 650: 758, 651: 759, 652: 760, 653: 761, 654: 762, 655: 763, 656: 764, 657: 765, 658: 766, 659: 767, 660: 768, 661: 769, 662: 770, 663: 771, 664: 773, 665: 774, 666: 775, 667: 776, 668: 777, 669: 778, 670: 780, 671: 781, 672: 782, 673: 783, 674: 784, 675: 785, 676: 789, 677: 790, 678: 791, 679: 792, 680: 793, 681: 794, 682: 795, 683: 796, 684: 797, 685: 798, 686: 799, 687: 800, 688: 801, 689: 802, 690: 803, 691: 804, 692: 805, 693: 806, 694: 807, 695: 808, 696: 809, 697: 810, 698: 811, 699: 812, 700: 813, 701: 814, 702: 815, 703: 816, 704: 817, 705: 818, 706: 819, 707: 820, 708: 821, 709: 823, 710: 824, 711: 825, 712: 826, 713: 827, 714: 828, 715: 830, 716: 831, 717: 832, 718: 833, 719: 834, 720: 835, 721: 839, 722: 840, 723: 842, 724: 843, 725: 845, 726: 846, 727: 852, 728: 853, 729: 854, 730: 855, 731: 856, 732: 857, 733: 858, 734: 859, 735: 860, 736: 862, 737: 863, 738: 864, 739: 865, 740: 866, 741: 867, 742: 868, 743: 869, 744: 870, 745: 872, 746: 873, 747: 874, 748: 875, 749: 876, 750: 877, 751: 878, 752: 879, 753: 880, 754: 882, 755: 883, 756: 884, 757: 885, 758: 886, 759: 887, 760: 889, 761: 890, 762: 891, 763: 892, 764: 893, 765: 894, 766: 895, 767: 896, 768: 898, 769: 899, 770: 901, 771: 902, 772: 908, 773: 909, 774: 910, 775: 911, 776: 912, 777: 913, 778: 914, 779: 915, 780: 916, 781: 918, 782: 919, 783: 920, 784: 921, 785: 922, 786: 923, 787: 924, 788: 925, 789: 926, 790: 928, 791: 929, 792: 930, 793: 931, 794: 932, 795: 933, 796: 934, 797: 935, 798: 936, 799: 938, 800: 939, 801: 940, 802: 941, 803: 942, 804: 943, 805: 945, 806: 946, 807: 947, 808: 948, 809: 949, 810: 950, 811: 951, 812: 952, 813: 954, 814: 955, 815: 957, 816: 958, 817: 964, 818: 965, 819: 966, 820: 967, 821: 968, 822: 969, 823: 970, 824: 971, 825: 972, 826: 974, 827: 975, 828: 976, 829: 977, 830: 978, 831: 979, 832: 980, 833: 981, 834: 982, 835: 984, 836: 985, 837: 986, 838: 987, 839: 988, 840: 989, 841: 990, 842: 991, 843: 992, 844: 994, 845: 995, 846: 996, 847: 997, 848: 998, 849: 999, 850: 1001, 851: 1002, 852: 1003, 853: 1004, 854: 1005, 855: 1006, 856: 1007, 857: 1008, 858: 1010, 859: 1011, 860: 1013, 861: 1014, 862: 1019, 863: 1020, 864: 1022, 865: 1023, 866: 1025, 867: 1026, 868: 1031, 869: 1032, 870: 1034, 871: 1035, 872: 1037, 873: 1038, 874: 1046, 875: 1047, 876: 1048, 877: 1049, 878: 1050, 879: 1051, 880: 1052, 881: 1053, 882: 1054, 883: 1055, 884: 1056, 885: 1057, 886: 1058, 887: 1059, 888: 1060, 889: 1061, 890: 1062, 891: 1063, 892: 1065, 893: 1066, 894: 1067, 895: 1068, 896: 1069, 897: 1070, 898: 1071, 899: 1072, 900: 1073, 901: 1074, 902: 1075, 903: 1076, 904: 1077, 905: 1078, 906: 1079, 907: 1080, 908: 1081, 909: 1082, 910: 1084, 911: 1085, 912: 1086, 913: 1087, 914: 1088, 915: 1089, 916: 1090, 917: 1091, 918: 1092, 919: 1093, 920: 1094, 921: 1095, 922: 1096, 923: 1097, 924: 1098, 925: 1099, 926: 1100, 927: 1101, 
928: 1103, 929: 1104, 930: 1105, 931: 1106, 932: 1107, 933: 1108, 934: 1110, 935: 1111, 936: 1112, 937: 1113, 938: 1114, 939: 1115, 940: 1117, 941: 1118, 942: 1119, 943: 1120, 944: 1121, 945: 1122} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 4: 3, 5: 4, 7: 5, 8: 6, 14: 7, 15: 8, 16: 9, 18: 10, 19: 11, 20: 12, 22: 13, 23: 14, 24: 15, 26: 16, 27: 17, 28: 18, 29: 19, 30: 20, 31: 21, 33: 22, 34: 23, 35: 24, 36: 25, 37: 26, 38: 27, 39: 28, 40: 29, 41: 30, 42: 31, 43: 32, 44: 33, 45: 34, 46: 35, 47: 36, 49: 37, 50: 38, 51: 39, 52: 40, 53: 41, 54: 42, 55: 43, 56: 44, 57: 45, 58: 46, 59: 47, 60: 48, 61: 49, 62: 50, 63: 51, 65: 52, 66: 53, 67: 54, 68: 55, 69: 56, 70: 57, 71: 58, 72: 59, 73: 60, 74: 61, 75: 62, 76: 63, 77: 64, 78: 65, 79: 66, 81: 67, 82: 68, 83: 69, 84: 70, 85: 71, 86: 72, 87: 73, 88: 74, 89: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 101: 85, 102: 86, 103: 87, 104: 88, 105: 89, 106: 90, 107: 91, 108: 92, 109: 93, 110: 94, 111: 95, 112: 96, 113: 97, 114: 98, 115: 99, 116: 100, 117: 101, 118: 102, 119: 103, 120: 104, 121: 105, 124: 106, 125: 107, 126: 108, 127: 109, 128: 110, 129: 111, 130: 112, 131: 113, 132: 114, 133: 115, 134: 116, 135: 117, 136: 118, 137: 119, 138: 120, 140: 121, 141: 122, 143: 123, 144: 124, 145: 125, 146: 126, 147: 127, 148: 128, 149: 129, 150: 130, 151: 131, 152: 132, 153: 133, 154: 134, 155: 135, 156: 136, 157: 137, 159: 138, 160: 139, 161: 140, 162: 141, 163: 142, 164: 143, 165: 144, 166: 145, 167: 146, 168: 147, 169: 148, 170: 149, 171: 150, 172: 151, 173: 152, 175: 153, 176: 154, 177: 155, 178: 156, 179: 157, 180: 158, 181: 159, 182: 160, 183: 161, 184: 162, 185: 163, 186: 164, 187: 165, 188: 166, 189: 167, 190: 168, 191: 169, 192: 170, 193: 171, 194: 172, 195: 173, 196: 174, 197: 175, 198: 176, 199: 177, 200: 178, 201: 179, 202: 180, 203: 181, 204: 182, 205: 183, 206: 184, 207: 185, 208: 186, 209: 187, 210: 188, 211: 189, 212: 190, 213: 191, 214: 192, 215: 193, 216: 194, 217: 195, 218: 196, 220: 197, 221: 198, 222: 199, 223: 200, 224: 201, 225: 202, 227: 203, 228: 204, 229: 205, 230: 206, 231: 207, 232: 208, 234: 209, 235: 210, 247: 211, 248: 212, 249: 213, 250: 214, 251: 215, 252: 216, 253: 217, 254: 218, 255: 219, 256: 220, 257: 221, 258: 222, 259: 223, 260: 224, 261: 225, 263: 226, 264: 227, 266: 228, 267: 229, 268: 230, 269: 231, 270: 232, 271: 233, 272: 234, 273: 235, 274: 236, 275: 237, 276: 238, 277: 239, 278: 240, 279: 241, 280: 242, 282: 243, 283: 244, 284: 245, 285: 246, 286: 247, 287: 248, 288: 249, 289: 250, 290: 251, 291: 252, 292: 253, 293: 254, 294: 255, 295: 256, 296: 257, 298: 258, 299: 259, 300: 260, 301: 261, 302: 262, 303: 263, 304: 264, 305: 265, 306: 266, 307: 267, 308: 268, 309: 269, 310: 270, 311: 271, 312: 272, 313: 273, 314: 274, 315: 275, 316: 276, 317: 277, 318: 278, 319: 279, 320: 280, 321: 281, 322: 282, 323: 283, 324: 284, 325: 285, 326: 286, 327: 287, 328: 288, 329: 289, 330: 290, 331: 291, 332: 292, 333: 293, 334: 294, 335: 295, 336: 296, 337: 297, 338: 298, 339: 299, 340: 300, 341: 301, 343: 302, 344: 303, 345: 304, 346: 305, 347: 306, 348: 307, 350: 308, 351: 309, 352: 310, 353: 311, 354: 312, 355: 313, 357: 314, 358: 315, 370: 316, 371: 317, 372: 318, 373: 319, 374: 320, 375: 321, 377: 322, 378: 323, 379: 324, 380: 325, 381: 326, 382: 327, 383: 328, 384: 329, 385: 330, 386: 331, 387: 332, 388: 333, 389: 334, 390: 335, 391: 336, 393: 337, 394: 338, 395: 339, 396: 340, 397: 341, 398: 342, 399: 343, 400: 344, 401: 345, 402: 346, 403: 347, 404: 348, 405: 349, 
406: 350, 407: 351, 409: 352, 410: 353, 411: 354, 412: 355, 413: 356, 414: 357, 415: 358, 416: 359, 417: 360, 418: 361, 419: 362, 420: 363, 421: 364, 422: 365, 423: 366, 425: 367, 426: 368, 427: 369, 428: 370, 429: 371, 430: 372, 431: 373, 432: 374, 433: 375, 434: 376, 435: 377, 437: 378, 438: 379, 440: 380, 441: 381, 447: 382, 448: 383, 449: 384, 450: 385, 451: 386, 452: 387, 453: 388, 454: 389, 455: 390, 457: 391, 458: 392, 459: 393, 460: 394, 461: 395, 462: 396, 463: 397, 464: 398, 465: 399, 467: 400, 468: 401, 469: 402, 470: 403, 471: 404, 472: 405, 473: 406, 474: 407, 475: 408, 477: 409, 478: 410, 479: 411, 480: 412, 481: 413, 482: 414, 484: 415, 485: 416, 486: 417, 487: 418, 488: 419, 489: 420, 493: 421, 494: 422, 495: 423, 496: 424, 497: 425, 498: 426, 500: 427, 501: 428, 502: 429, 503: 430, 504: 431, 505: 432, 506: 433, 507: 434, 508: 435, 509: 436, 510: 437, 511: 438, 512: 439, 513: 440, 514: 441, 516: 442, 517: 443, 518: 444, 519: 445, 520: 446, 521: 447, 522: 448, 523: 449, 524: 450, 525: 451, 526: 452, 527: 453, 528: 454, 529: 455, 530: 456, 532: 457, 533: 458, 534: 459, 535: 460, 536: 461, 537: 462, 538: 463, 539: 464, 540: 465, 541: 466, 542: 467, 543: 468, 544: 469, 545: 470, 546: 471, 548: 472, 549: 473, 550: 474, 551: 475, 552: 476, 553: 477, 554: 478, 555: 479, 556: 480, 557: 481, 558: 482, 560: 483, 561: 484, 563: 485, 564: 486, 570: 487, 571: 488, 572: 489, 573: 490, 574: 491, 575: 492, 576: 493, 577: 494, 578: 495, 580: 496, 581: 497, 582: 498, 583: 499, 584: 500, 585: 501, 586: 502, 587: 503, 588: 504, 590: 505, 591: 506, 592: 507, 593: 508, 594: 509, 595: 510, 596: 511, 597: 512, 598: 513, 600: 514, 601: 515, 602: 516, 603: 517, 604: 518, 605: 519, 607: 520, 608: 521, 609: 522, 610: 523, 611: 524, 612: 525, 616: 526, 617: 527, 618: 528, 619: 529, 620: 530, 621: 531, 623: 532, 624: 533, 625: 534, 626: 535, 627: 536, 628: 537, 629: 538, 630: 539, 631: 540, 632: 541, 633: 542, 634: 543, 635: 544, 636: 545, 637: 546, 639: 547, 640: 548, 641: 549, 642: 550, 643: 551, 644: 552, 645: 553, 646: 554, 647: 555, 648: 556, 649: 557, 650: 558, 651: 559, 652: 560, 653: 561, 655: 562, 656: 563, 657: 564, 658: 565, 659: 566, 660: 567, 661: 568, 662: 569, 663: 570, 664: 571, 665: 572, 666: 573, 667: 574, 668: 575, 669: 576, 671: 577, 672: 578, 673: 579, 674: 580, 675: 581, 676: 582, 677: 583, 678: 584, 679: 585, 680: 586, 681: 587, 683: 588, 684: 589, 686: 590, 687: 591, 693: 592, 694: 593, 695: 594, 696: 595, 697: 596, 698: 597, 699: 598, 700: 599, 701: 600, 703: 601, 704: 602, 705: 603, 706: 604, 707: 605, 708: 606, 709: 607, 710: 608, 711: 609, 713: 610, 714: 611, 715: 612, 716: 613, 717: 614, 718: 615, 719: 616, 720: 617, 721: 618, 723: 619, 724: 620, 725: 621, 726: 622, 727: 623, 728: 624, 730: 625, 731: 626, 732: 627, 733: 628, 734: 629, 735: 630, 739: 631, 740: 632, 741: 633, 742: 634, 743: 635, 744: 636, 745: 637, 746: 638, 747: 639, 748: 640, 749: 641, 750: 642, 751: 643, 752: 644, 753: 645, 754: 646, 755: 647, 756: 648, 757: 649, 758: 650, 759: 651, 760: 652, 761: 653, 762: 654, 763: 655, 764: 656, 765: 657, 766: 658, 767: 659, 768: 660, 769: 661, 770: 662, 771: 663, 773: 664, 774: 665, 775: 666, 776: 667, 777: 668, 778: 669, 780: 670, 781: 671, 782: 672, 783: 673, 784: 674, 785: 675, 789: 676, 790: 677, 791: 678, 792: 679, 793: 680, 794: 681, 795: 682, 796: 683, 797: 684, 798: 685, 799: 686, 800: 687, 801: 688, 802: 689, 803: 690, 804: 691, 805: 692, 806: 693, 807: 694, 808: 695, 809: 696, 810: 697, 811: 698, 812: 699, 813: 700, 814: 701, 815: 702, 816: 703, 817: 704, 818: 
705, 819: 706, 820: 707, 821: 708, 823: 709, 824: 710, 825: 711, 826: 712, 827: 713, 828: 714, 830: 715, 831: 716, 832: 717, 833: 718, 834: 719, 835: 720, 839: 721, 840: 722, 842: 723, 843: 724, 845: 725, 846: 726, 852: 727, 853: 728, 854: 729, 855: 730, 856: 731, 857: 732, 858: 733, 859: 734, 860: 735, 862: 736, 863: 737, 864: 738, 865: 739, 866: 740, 867: 741, 868: 742, 869: 743, 870: 744, 872: 745, 873: 746, 874: 747, 875: 748, 876: 749, 877: 750, 878: 751, 879: 752, 880: 753, 882: 754, 883: 755, 884: 756, 885: 757, 886: 758, 887: 759, 889: 760, 890: 761, 891: 762, 892: 763, 893: 764, 894: 765, 895: 766, 896: 767, 898: 768, 899: 769, 901: 770, 902: 771, 908: 772, 909: 773, 910: 774, 911: 775, 912: 776, 913: 777, 914: 778, 915: 779, 916: 780, 918: 781, 919: 782, 920: 783, 921: 784, 922: 785, 923: 786, 924: 787, 925: 788, 926: 789, 928: 790, 929: 791, 930: 792, 931: 793, 932: 794, 933: 795, 934: 796, 935: 797, 936: 798, 938: 799, 939: 800, 940: 801, 941: 802, 942: 803, 943: 804, 945: 805, 946: 806, 947: 807, 948: 808, 949: 809, 950: 810, 951: 811, 952: 812, 954: 813, 955: 814, 957: 815, 958: 816, 964: 817, 965: 818, 966: 819, 967: 820, 968: 821, 969: 822, 970: 823, 971: 824, 972: 825, 974: 826, 975: 827, 976: 828, 977: 829, 978: 830, 979: 831, 980: 832, 981: 833, 982: 834, 984: 835, 985: 836, 986: 837, 987: 838, 988: 839, 989: 840, 990: 841, 991: 842, 992: 843, 994: 844, 995: 845, 996: 846, 997: 847, 998: 848, 999: 849, 1001: 850, 1002: 851, 1003: 852, 1004: 853, 1005: 854, 1006: 855, 1007: 856, 1008: 857, 1010: 858, 1011: 859, 1013: 860, 1014: 861, 1019: 862, 1020: 863, 1022: 864, 1023: 865, 1025: 866, 1026: 867, 1031: 868, 1032: 869, 1034: 870, 1035: 871, 1037: 872, 1038: 873, 1046: 874, 1047: 875, 1048: 876, 1049: 877, 1050: 878, 1051: 879, 1052: 880, 1053: 881, 1054: 882, 1055: 883, 1056: 884, 1057: 885, 1058: 886, 1059: 887, 1060: 888, 1061: 889, 1062: 890, 1063: 891, 1065: 892, 1066: 893, 1067: 894, 1068: 895, 1069: 896, 1070: 897, 1071: 898, 1072: 899, 1073: 900, 1074: 901, 1075: 902, 1076: 903, 1077: 904, 1078: 905, 1079: 906, 1080: 907, 1081: 908, 1082: 909, 1084: 910, 1085: 911, 1086: 912, 1087: 913, 1088: 914, 1089: 915, 1090: 916, 1091: 917, 1092: 918, 1093: 919, 1094: 920, 1095: 921, 1096: 922, 1097: 923, 1098: 924, 1099: 925, 1100: 926, 1101: 927, 1103: 928, 1104: 929, 1105: 930, 1106: 931, 1107: 932, 1108: 933, 1110: 934, 1111: 935, 1112: 936, 1113: 937, 1114: 938, 1115: 939, 1117: 940, 1118: 941, 1119: 942, 1120: 943, 1121: 944, 1122: 945} [model_handling.py at line 1577]  +Generated helas calls for 1 subprocesses (1240 diagrams) in 5.829 s +Wrote files for 2281 helas calls in 15.156 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.373 s +ALOHA: aloha creates 5 routines in 0.258 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.313 s +ALOHA: aloha creates 10 routines in 0.293 s VVV1 VVV1 FFV1 @@ -210,38 +209,32 @@ ALOHA: aloha creates 10 routines in 0.313 s VVVV3 VVVV4 VVVV4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses/P1_gg_ttxggg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #2 succeeded at 339 (offset 112 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. +DEBUG: result.returncode =  0 [output.py at line 273]  +Output to directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README +/home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README Run "open index.html" to see more information about this process. 
quit -real 0m33.065s -user 0m32.263s -sys 0m0.459s -Code generation completed in 33 seconds +real 0m28.853s +user 0m28.097s +sys 0m0.568s +Code generation completed in 29 seconds ************************************************************ * * * W E L C O M E to * @@ -254,7 +247,7 @@ Code generation completed in 33 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -262,10 +255,9 @@ Code generation completed in 33 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -284,7 +276,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -292,10 +284,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT b/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. 
Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt index 68b4c46295..311ceaa803 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat index cdd9d43b05..0125eda85b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.5 2025-10-17 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat index a08f93d92b..596243d42e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat index 48050a5fd7..377d5bc1c7 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_ttggg.mad/MGMEVersion.txt b/epochX/cudacpp/gg_ttggg.mad/MGMEVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/gg_ttggg.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_ttggg.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/.make_opts b/epochX/cudacpp/gg_ttggg.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/.make_opts +++ b/epochX/cudacpp/gg_ttggg.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/alfas_functions.f b/epochX/cudacpp/gg_ttggg.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/gg_ttggg.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/cuts.inc b/epochX/cudacpp/gg_ttggg.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/cuts.inc +++ b/epochX/cudacpp/gg_ttggg.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/make_opts b/epochX/cudacpp/gg_ttggg.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/make_opts +++ b/epochX/cudacpp/gg_ttggg.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/makefile b/epochX/cudacpp/gg_ttggg.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/makefile +++ b/epochX/cudacpp/gg_ttggg.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/run_card.inc b/epochX/cudacpp/gg_ttggg.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/run_card.inc +++ b/epochX/cudacpp/gg_ttggg.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. 
In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). + */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
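// A standalone sketch of the block/thread selection above (chooseGpuGrid and its
// constants are illustrative only, not part of the plugin API): start from 256
// threads per block and halve until nevt == gpublocks*gputhreads, giving up below
// the 32-thread minimum, exactly as the Bridge constructor does.
//   e.g. chooseGpuGrid( 8192 ) -> { 32, 256 }, chooseGpuGrid( 512 ) -> { 2, 256 }, chooseGpuGrid( 32 ) -> { 1, 32 }
#include <stdexcept> // needed only by this standalone illustration
#include <utility>
inline std::pair<int, int> chooseGpuGrid( unsigned int nevt )
{
  constexpr int gputhreadsmin = 32; // same minimum as s_gputhreadsmin above
  int gputhreads = 256;             // same default as in the Bridge constructor
  int gpublocks = nevt / gputhreads;
  while( nevt != (unsigned int)( gpublocks * gputhreads ) )
  {
    gputhreads /= 2;
    if( gputhreads < gputhreadsmin ) throw std::logic_error( "cannot choose gputhreads" );
    gpublocks = nevt / gputhreads;
  }
  return { gpublocks, gputhreads };
}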
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
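// Worked example of the AOS (Fortran) -> AOSOA (C++) index mapping used in these
// transposition loops, with hypothetical npar=4, np4=4, neppM=4 (illustration only,
// not part of the generated code). The formulas are the fpos/cpos expressions below:
//   fpos = ievt*npar*np4 + ipar*np4 + ip4
//   cpos = ipagM*npar*np4*neppM + ipar*np4*neppM + ip4*neppM + ieppM, with ievt = ipagM*neppM + ieppM
namespace // editor-style illustration with hypothetical values
{
  constexpr int exNpar = 4, exNp4 = 4, exNeppM = 4;
  constexpr int exFpos( int ievt, int ipar, int ip4 ) { return ievt * exNpar * exNp4 + ipar * exNp4 + ip4; }
  constexpr int exCpos( int ievt, int ipar, int ip4 )
  {
    return ( ievt / exNeppM ) * exNpar * exNp4 * exNeppM + ipar * exNp4 * exNeppM + ip4 * exNeppM + ( ievt % exNeppM );
  }
  static_assert( exFpos( 5, 2, 3 ) == 91, "AOS index for (ievt=5, ipar=2, ip4=3)" );
  static_assert( exCpos( 5, 2, 3 ) == 109, "AOSOA index for (ievt=5, ipar=2, ip4=3)" );
}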
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MGVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
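// A minimal standalone usage sketch of the checkGpuBlas/assertGpuBlas pattern added
// in GpuRuntime.h above (CUDA/cuBLAS spelling only; on HIP the same calls map to
// hipBLAS via GpuAbstraction.h). This is an illustration, not plugin code.
#include "cublas_v2.h"
#include <cassert>
#include <cstdio>
#define checkBlas( code ) { assertBlas( code, __FILE__, __LINE__ ); }
inline void assertBlas( cublasStatus_t code, const char* file, int line, bool abort = true )
{
  if( code != CUBLAS_STATUS_SUCCESS )
  {
    printf( "ERROR! assertBlas: '%d' in %s:%d\n", code, file, line );
    if( abort ) assert( code == CUBLAS_STATUS_SUCCESS );
  }
}
int main()
{
  cublasHandle_t handle;
  checkBlas( cublasCreate( &handle ) ); // any non-success status prints and asserts
  checkBlas( cublasDestroy( handle ) );
  return 0;
}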
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
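// A compact standalone sketch showing how the gpuBlas*/gpuStream* abstractions used
// by these kernels translate to plain CUDA/cuBLAS calls: handle creation, binding to
// a stream, the optional CUBLAS_TF32_TENSOR_OP_MATH mode enabled above for
// FPTYPE2=float builds, and one SGEMM. It is an illustration only, not the plugin's
// color_sum implementation.
#include "cublas_v2.h"
#include <cuda_runtime.h>
#include <cassert>
#include <vector>
int main()
{
  const int n = 4; // tiny n x n GEMM: C = A * B with A=1, B=2 everywhere
  std::vector<float> hA( n * n, 1.f ), hB( n * n, 2.f ), hC( n * n, 0.f );
  float *dA, *dB, *dC;
  cudaMalloc( &dA, n * n * sizeof( float ) );
  cudaMalloc( &dB, n * n * sizeof( float ) );
  cudaMalloc( &dC, n * n * sizeof( float ) );
  cudaMemcpy( dA, hA.data(), n * n * sizeof( float ), cudaMemcpyHostToDevice );
  cudaMemcpy( dB, hB.data(), n * n * sizeof( float ), cudaMemcpyHostToDevice );
  cudaStream_t stream;
  cudaStreamCreate( &stream );                             // cf. gpuStreamCreate
  cublasHandle_t handle;
  cublasCreate( &handle );                                 // cf. gpuBlasCreate
  cublasSetStream( handle, stream );                       // cf. gpuBlasSetStream
  cublasSetMathMode( handle, CUBLAS_TF32_TENSOR_OP_MATH ); // affects FP32 GEMMs only, as noted above
  const float alpha = 1.f, beta = 0.f;
  cublasSgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N,           // cf. gpuBlasSgemm, GPUBLAS_OP_N
               n, n, n, &alpha, dA, n, dB, n, &beta, dC, n );
  cudaStreamSynchronize( stream );
  cudaMemcpy( hC.data(), dC, n * n * sizeof( float ), cudaMemcpyDeviceToHost );
  assert( hC[0] == 8.f );                                  // 1*2 summed over k=4
  cublasDestroy( handle );                                 // cf. gpuBlasDestroy
  cudaStreamDestroy( stream );
  cudaFree( dA );
  cudaFree( dB );
  cudaFree( dC );
  return 0;
}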
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h index 65a101888d..2fa0ce29e0 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ 
namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer 
DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc index ba06f6ff44..0548d00f74 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 120; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities -#endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId 
= 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) +#endif + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using 
E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
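The new per-helicity kernel writes its jamps and jamp2 sums into event-major buffers, and the DeviceAccessJamp2 helper above encodes the corresponding index, buffer[icol * nevt + ievt], so that consecutive CUDA threads (consecutive ievt) touch consecutive addresses. Below is a minimal standalone sketch of that access pattern; the kernel and buffer names are hypothetical illustrations, not code from the patch itself.

#include <cuda_runtime.h>
#include <cstdio>

// Hypothetical illustration of DeviceAccessJamp2-style event-major indexing:
// element (icol, ievt) of an [ncolor][nevt] buffer lives at buffer[icol * nevt + ievt].
__global__ void accumulateJamp2( const float* jampRe, const float* jampIm, float* jamp2, const int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    const float re = jampRe[icol * nevt + ievt];
    const float im = jampIm[icol * nevt + ievt];
    jamp2[icol * nevt + ievt] += re * re + im * im; // add |jamp(icol)|^2 for this event
  }
}

int main()
{
  const int ncolor = 4, gpublocks = 2, gputhreads = 32, nevt = gpublocks * gputhreads;
  const size_t bytes = ncolor * nevt * sizeof( float );
  float *dRe, *dIm, *dJamp2;
  cudaMalloc( (void**)&dRe, bytes );
  cudaMalloc( (void**)&dIm, bytes );
  cudaMalloc( (void**)&dJamp2, bytes );
  cudaMemset( dRe, 0, bytes );
  cudaMemset( dIm, 0, bytes );
  cudaMemset( dJamp2, 0, bytes );
  accumulateJamp2<<<gpublocks, gputhreads>>>( dRe, dIm, dJamp2, ncolor );
  cudaDeviceSynchronize();
  printf( "accumulated jamp2 for %d colors x %d events\n", ncolor, nevt );
  cudaFree( dRe );
  cudaFree( dIm );
  cudaFree( dJamp2 );
  return 0;
}

With this layout each colour's slice is contiguous across events, which is what keeps the per-thread loads coalesced.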
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -29966,272 +30022,43 @@ namespace mg5amcCpu jamp_sv[116] -= amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttxggg()?) 
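The in-line colour algebra removed below evaluated, per helicity, the quadratic form deltaME = sum over icol of Re( conj(jamp[icol]) * sum over jcol of cf[icol][jcol] * jamp[jcol] ) / denom[icol]; as noted in the comments above, this colour sum is now delegated to the separate color_sum.h implementation. A toy-sized host-side sketch of the same reduction follows, with hypothetical 2-colour constants in place of the 120-colour gg_ttxggg arrays.

#include <complex>
#include <cstdio>

typedef std::complex<double> cxtype;

// deltaME = sum_i Re( conj(jamp[i]) * sum_j cf[i][j] * jamp[j] ) / denom[i]
double colorSum( const cxtype* jamp, const double* cf, const double* denom, const int ncolor )
{
  double deltaME = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    cxtype ztemp = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ )
      ztemp += cf[icol * ncolor + jcol] * jamp[jcol]; // row-major cf[icol][jcol]
    deltaME += ( std::conj( jamp[icol] ) * ztemp ).real() / denom[icol];
  }
  return deltaME;
}

int main()
{
  const int ncolor = 2;                              // toy size, not the real ncolor = 120
  const double cf[ncolor * ncolor] = { 9, 3, 3, 9 }; // toy colour matrix (not the process constants)
  const double denom[ncolor] = { 3, 3 };             // toy colour denominators
  const cxtype jamp[ncolor] = { cxtype( 1., 0.5 ), cxtype( -0.25, 2. ) };
  printf( "deltaME = %f\n", colorSum( jamp, cf, denom, ncolor ) );
  return 0;
}

Only the cf and denom constants are process-specific; the reduction itself has the same shape for every process, which is what makes it natural to factor out of the generated CPPProcess.cc.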
- - // The color denominators (initialize all array elements, with ncolor=120) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324 }; // 1-D array[120] - - // The color matrix (initialize all array elements, with ncolor=120) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136 }, - { -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116 }, - { -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116 }, - { 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44 }, - { 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, 
-134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44 }, - { 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514 }, - { -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116 }, - { 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442 }, - { 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44 }, - { -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28 }, - { -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 
442, -116, 28, -44, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53 }, - { -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62 }, - { 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44 }, - { -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53 }, - { 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514 }, - { -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62 }, - { 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100 }, - { 496, -224, -80, -8, 
496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10 }, - { -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28 }, - { -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62 }, - { -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62 }, - { 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10 }, - { 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10 }, - { -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, 
-62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1 }, - { -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116 }, - { 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442 }, - { 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442 }, - { -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134 }, - { -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134 }, - { -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 10, 
-80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505 }, - { 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44 }, - { -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134 }, - { -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28 }, - { 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224 }, - { 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62 }, - { 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 
1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496 }, - { -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53 }, - { 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19 }, - { -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62 }, - { 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496 }, - { -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10 }, - { -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80 }, - { 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, 
-80, -80, 10, 10, 100, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62 }, - { 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71 }, - { 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10 }, - { -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80 }, - { -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1 }, - { 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8 }, - { 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 4096, -512, -512, 64, 64, 640, 
-512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44 }, - { -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134 }, - { -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53 }, - { 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62 }, - { 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19 }, - { 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71 }, - { 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 
1, -62, -71, -116, 442, 442, -134, -134, 505, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514 }, - { -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505 }, - { -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62 }, - { 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496 }, - { 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71 }, - { 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568 }, - { 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 
100 }, - { -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10 }, - { 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10 }, - { -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80 }, - { 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80 }, - { 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640 }, - { -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10 }, - { 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, -53, -62, 442, -116, 28, -44, -62, 
10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1 }, - { -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1 }, - { -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8 }, - { -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8 }, - { -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64 }, - { -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28 }, - { 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, 
-44, 136, -116, 514, -44, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62 }, - { -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62 }, - { 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10 }, - { -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10 }, - { -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1 }, - { -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62 }, - { 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, 28, -224, -44, 
-134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71 }, - { 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10 }, - { -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80 }, - { 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1 }, - { -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8 }, - { 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10 }, - { -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1 }, - { -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, 
-53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1 }, - { 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8 }, - { -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8 }, - { -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64 }, - { 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80 }, - { -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8 }, - { -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 
100, 10, 640, -80, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8 }, - { 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64 }, - { 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64 }, - { -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512 }, - { 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224 }, - { 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496 }, - { 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, -44, -53, 514, -62, 100, 10, -116, -44, 
136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496 }, - { -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80 }, - { -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80 }, - { 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8 }, - { 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496 }, - { 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568 }, - { -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, 
-8, 496, -224, -80, -8, 496, -80 }, - { 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640 }, - { -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8 }, - { -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64 }, - { -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80 }, - { -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8 }, - { 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8 }, - { -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 
-53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64 }, - { 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64 }, - { -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512 }, - { 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640 }, - { -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64 }, - { -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64 }, - { -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, 
-224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512 }, - { -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512 }, - { 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096 } }; // 2-D array[120][120] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! 
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -30383,7 +30210,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } 
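The hunks above strip the in-kernel color algebra (the quadratic form over the ncolor=120 cf/denom matrix, including the constexpr TriangularNormalizedColorMatrix of #475) out of calculate_wavefunctions, now renamed calculate_jamps. As a reference for what that color sum computes, here is a minimal standalone C++ sketch (an editor's illustration, not generated code): the names colorSumFull and colorSumTriangular and the 2x2 cf/denom values are invented for the example, but the two functions follow, respectively, the removed CUDA-style full double loop and the removed C++-style upper-triangle loop, and they agree because the normalized matrix cf[i][j]/denom[i] is real and symmetric.

#include <cassert>
#include <cmath>
#include <complex>
#include <cstdio>

constexpr int ncolor = 2;                                         // the real process above has ncolor = 120
constexpr double cf[ncolor][ncolor] = { { 16, -2 }, { -2, 16 } }; // illustrative real symmetric color factors
constexpr double denom[ncolor] = { 3, 3 };                        // illustrative per-row denominators

// Full quadratic form, as in the (removed) CUDA branch:
// |M|^2 += sum_i [ (sum_j cf[i][j]*Aj)*Ai + (sum_j cf[i][j]*Bj)*Bi ] / denom[i], with A=Re(jamp), B=Im(jamp)
double colorSumFull( const std::complex<double> jamp[ncolor] )
{
  double me2 = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = 0, ztempI = 0;
    for( int j = 0; j < ncolor; j++ )
    {
      ztempR += cf[i][j] * jamp[j].real();
      ztempI += cf[i][j] * jamp[j].imag();
    }
    me2 += ( ztempR * jamp[i].real() + ztempI * jamp[i].imag() ) / denom[i];
  }
  return me2;
}

// Upper-triangle form, as in the (removed) C++ branch: since cf[i][j]/denom[i] is real and
// symmetric, (A-iB) M (A+iB) = A M A + B M B, and the (i,j) and (j,i) off-diagonal terms fold
// into a single 2*cf[i][j]/denom[i] coefficient, which TriangularNormalizedColorMatrix
// precomputed at compile time.
double colorSumTriangular( const std::complex<double> jamp[ncolor] )
{
  double me2 = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = cf[i][i] / denom[i] * jamp[i].real();
    double ztempI = cf[i][i] / denom[i] * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztempR += 2 * cf[i][j] / denom[i] * jamp[j].real();
      ztempI += 2 * cf[i][j] / denom[i] * jamp[j].imag();
    }
    me2 += jamp[i].real() * ztempR + jamp[i].imag() * ztempI;
  }
  return me2;
}

int main()
{
  const std::complex<double> jamp[ncolor] = { { 1.5, -0.5 }, { -0.25, 2.0 } };
  const double me2full = colorSumFull( jamp );
  const double me2tri = colorSumTriangular( jamp );
  printf( "full=%.12f triangular=%.12f\n", me2full, me2tri );
  assert( std::fabs( me2full - me2tri ) < 1e-12 );
  return 0;
}

In the new layout this same contraction is no longer done inline per event: the hunks below hand the jamps to color_sum_gpu (per good helicity, using the gpuBlasHandle_t and per-helicity streams passed to sigmaKin) on the GPU side, and to color_sum_cpu on the SIMD C++ side.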
//-------------------------------------------------------------------------- @@ -30419,6 +30250,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -30462,6 +30297,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -30582,8 +30421,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -30591,25 +30430,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); 
nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -30754,13 +30771,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 1536 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -30772,18 +30783,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -30808,93 +30824,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -30936,7 +30889,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -30959,7 +30912,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -30968,21 +30921,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -30996,8 +30951,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -31013,11 +30970,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -31119,14 +31077,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h index 2eb1e066ff..f20243637a 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 128; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 1240; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 120; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, 
running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f index 523ef1948b..e0c6371008 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f index 3152176aa0..3e9140b741 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -442,51 +446,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 
0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc new file mode 100644 index 0000000000..dea7f9fdb2 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc @@ -0,0 +1,545 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=120) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324 }; // 1-D array[120] + + // The color matrix (initialize all array elements, with ncolor=120) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136 }, + { -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, -8, 
64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116 }, + { -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116 }, + { 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44 }, + { 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44 }, + { 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514 }, + { -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116 }, + { 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, 
-116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442 }, + { 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44 }, + { -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28 }, + { -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53 }, + { -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62 }, + { 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44 }, + { -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, 
-80, 10, 496, -62, 19, -53 }, + { 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514 }, + { -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62 }, + { 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100 }, + { 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10 }, + { -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28 }, + { -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62 }, + { -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, 10, -80, 
-62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62 }, + { 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10 }, + { 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10 }, + { -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1 }, + { -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116 }, + { 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442 }, + { 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 
496, -62, -44, -134, -134, 442, 505, -134, -116, 442, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442 }, + { -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134 }, + { -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134 }, + { -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505 }, + { 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44 }, + { -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134 }, + { -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, 
-62, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28 }, + { 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224 }, + { 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62 }, + { 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496 }, + { -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53 }, + { 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19 }, + { -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62 }, + { 10, -62, 1, 10, 28, -62, -62, 
-53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496 }, + { -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10 }, + { -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80 }, + { 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62 }, + { 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71 }, + { 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10 }, + { -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 
640, -8, 64, 64, 640, -512, 4096, 64, -512, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80 }, + { -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1 }, + { 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8 }, + { 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44 }, + { -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134 }, + { -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53 }, + { 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, 10, 100, -80, 
640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62 }, + { 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19 }, + { 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71 }, + { 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514 }, + { -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505 }, + { -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62 }, + { 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, 
-71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496 }, + { 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71 }, + { 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568 }, + { 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100 }, + { -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10 }, + { 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10 }, + { -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80 }, + { 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, 
-44, -62, 496, 28, -224, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80 }, + { 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640 }, + { -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10 }, + { 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1 }, + { -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1 }, + { -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8 }, + { -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, 496, 568, -80, 
640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8 }, + { -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64 }, + { -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28 }, + { 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62 }, + { -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62 }, + { 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10 }, + { -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, 
-224, -80, 496, -512, 64, 64, -8, -8, -80, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10 }, + { -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1 }, + { -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62 }, + { 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71 }, + { 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10 }, + { -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80 }, + { 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1 }, + { 
-53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8 }, + { 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10 }, + { -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1 }, + { -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1 }, + { 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8 }, + { -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8 }, + { -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -134, 442, 505, -134, -116, 442, 19, -53, 
10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64 }, + { 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80 }, + { -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8 }, + { -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8 }, + { 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64 }, + { 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64 }, + { -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 
100, 10, 10, -80, -80, 640, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512 }, + { 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224 }, + { 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496 }, + { 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496 }, + { -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80 }, + { -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80 }, + { 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 640, 64, 64, -512, -512, 4096, 
-80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8 }, + { 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496 }, + { 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568 }, + { -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80 }, + { 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640 }, + { -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8 }, + { -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64 }, + { -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, 
-134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80 }, + { -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8 }, + { 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8 }, + { -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64 }, + { 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64 }, + { -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512 }, + { 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, 
-8, 1, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640 }, + { -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64 }, + { -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64 }, + { -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512 }, + { -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512 }, + { 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096 } }; // 2-D array[120][120] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / 
colorDenom[icol];
+ }
+ T value[ncolor * ncolor];
+ };
+ // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas)
+ static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor];
+#endif
+
+ //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+ void createNormalizedColorMatrix()
+ {
+ static bool first = true;
+ if( first )
+ {
+ first = false;
+ constexpr NormalizedColorMatrix normalizedColorMatrix2;
+ gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) );
+ }
+ }
+#endif
+
+ //--------------------------------------------------------------------------
+
+#ifndef MGONGPUCPP_GPUIMPL
+ void
+ color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity
+ const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity
+ const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+ {
+ // Pre-compute a constexpr triangular color matrix properly normalized #475
+ struct TriangularNormalizedColorMatrix
+ {
+ // See https://stackoverflow.com/a/34465458
+ __host__ __device__ constexpr TriangularNormalizedColorMatrix()
+ : value()
+ {
+ for( int icol = 0; icol < ncolor; icol++ )
+ {
+ // Diagonal terms
+ value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol];
+ // Off-diagonal terms
+ for( int jcol = icol + 1; jcol < ncolor; jcol++ )
+ value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol];
+ }
+ }
+ fptype2 value[ncolor][ncolor];
+ };
+ static constexpr auto cf2 = TriangularNormalizedColorMatrix();
+ // Use the property that M is a real matrix (see #475):
+ // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB (the imaginary cross terms cancel since M is real and symmetric)
+ // In addition, in C++ use the property that M is symmetric (see #475),
+ // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time:
+ // we gain in speed here (though not a factor of 2...) as we only loop over the upper triangular part of the matrix.
+ // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
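To make the triangular trick in the comment above concrete, here is a minimal standalone C++ sketch (an editor's illustration, not part of this patch): it checks, on a toy 3x3 symmetric matrix with a constant denominator and made-up complex amplitudes, that looping only over the upper triangle with off-diagonal entries pre-multiplied by 2 reproduces the full quadratic form AMA + BMB. The toy colorMatrix/colorDenom values, ncolor=3 and std::complex (in place of the plugin's SIMD types) are assumptions for the example only.

// sketch_triangular_color_sum.cpp (illustrative only)
#include <cassert>
#include <cmath>
#include <complex>
#include <iostream>

int main()
{
  constexpr int ncolor = 3;
  const double colorMatrix[ncolor][ncolor] = { { 4, -1, 1 }, { -1, 4, -1 }, { 1, -1, 4 } }; // toy symmetric matrix
  const double colorDenom[ncolor] = { 3, 3, 3 };                                            // toy constant denominator
  const std::complex<double> jamp[ncolor] = { { 1.0, 0.5 }, { -0.2, 0.8 }, { 0.3, -0.4 } }; // toy color amplitudes
  // Full quadratic form: |M|^2 = Re( sum_ij conj(J_i) * (M_ij/d_i) * J_j ) = AMA + BMB
  double me2Full = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      me2Full += ( std::conj( jamp[i] ) * ( colorMatrix[i][j] / colorDenom[i] ) * jamp[j] ).real();
  // Triangular accumulation: diagonal counted once, off-diagonal terms folded in with a factor 2
  double me2Tri = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = ( colorMatrix[i][i] / colorDenom[i] ) * jamp[i].real();
    double ztempI = ( colorMatrix[i][i] / colorDenom[i] ) * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztempR += 2 * ( colorMatrix[i][j] / colorDenom[i] ) * jamp[j].real();
      ztempI += 2 * ( colorMatrix[i][j] / colorDenom[i] ) * jamp[j].imag();
    }
    me2Tri += jamp[i].real() * ztempR + jamp[i].imag() * ztempI;
  }
  assert( std::fabs( me2Full - me2Tri ) < 1e-12 ); // the two accumulations agree up to rounding
  std::cout << "full=" << me2Full << " triangular=" << me2Tri << std::endl;
  return 0;
}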
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/configs.inc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/configs.inc index cd0b177907..5d7030cc05 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/configs.inc +++ 
b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/configs.inc @@ -16695,3 +16695,5 @@ C Diagram 945 DATA (SPROP(I,-5,945),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/945/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/driver.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/driver.f index 3671cdce55..c559e01778 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/driver.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/fbridge.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/makefile_original.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f index 07ccd4d1a4..72956c33dc 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -71,10 +71,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -339,17 +336,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -419,7 +405,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -462,7 +448,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(3030) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -505,9375 +492,738 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 1),I= 7, 12) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 1),I= 13, 18) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 1),I= 19, 24) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 1),I= 25, 30) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 1),I= 31, 36) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 1),I= 37, 42) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 1),I= 43, 48) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 1),I= 49, 54) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 1),I= 55, 60) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 1),I= 61, 66) /1.753086419753086D+00, - $ 
-2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 1),I= 67, 72) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 1),I= 73, 78) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 1),I= 79, 84) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 1),I= 85, 90) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 1),I= 91, 96) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 1),I= 97,102) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 1),I=103,108) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 1),I=109,114) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 1),I=115,120) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ + DATA DENOM/324/ + DATA (CF(I),I= 1,120) /4096,-1024,-1024,128,128,1280,-1024,128 + $ ,128,-16,-16,-160,128,-16,1280,-160,1136,992,-16,-160,-160,992 + $ ,992,-448,-1024,128,128,-16,-16,-160,128,-16,-16,2,2,20,-16,2, + $ -160,20,-142,-124,2,20,20,-124,-124,56,128,-16,-16,2,2,20,1280, + $ -160,-160,20,20,200,1136,-142,992,-124,1010,1028,-142,38,-124, + $ -106,-268,-88,-16,2,-160,20,-142,-124,-160,20,992,-124,38,-106 + $ ,992,-124,-448,56,-268,-88,1010,-268,-268,884,884,-232,2,20,20, + $ -124,-124,56,20,200,-124,1028,-106,-88,-124,-106,56,-88,884, + $ -232,1028,-88,-88,-232,-232,272/ C 1 T(1,2,5,6,7,3,4) - DATA (CF(I, 2),I= 1, 6) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 2),I= 7, 12) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 2),I= 13, 18) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 2),I= 19, 24) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 2),I= 25, 30) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 2),I= 31, 36) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 2),I= 37, 42) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ 
-1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 2),I= 43, 48) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 2),I= 49, 54) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 2),I= 55, 60) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 2),I= 61, 66) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 2),I= 67, 72) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 2),I= 73, 78) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 2),I= 79, 84) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 2),I= 85, 90) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 2),I= 91, 96) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 2),I= 97,102) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 2),I=103,108) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 2),I=109,114) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 2),I=115,120) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ + DATA (CF(I),I=121,239) /4096,128,1280,-1024,128,128,-1024,-16, + $ -160,128,-16,-16,-160,-160,992,992,-448,128,-16,1280,-160,1136 + $ ,992,128,-1024,-16,-160,128,-16,-16,128,2,20,-16,2,2,20,20,-124 + $ ,-124,56,-16,2,-160,20,-142,-124,-16,128,2,20,-16,2,-160,1280 + $ ,20,200,-160,20,-142,38,-124,-106,-268,-88,1136,-142,992,-124 + $ ,1010,1028,2,20,20,-124,-124,56,20,200,-124,1028,-106,-88,-124, + $ -106,56,-88,884,-232,1028,-88,-88,-232,-232,272,-16,2,-160,20, + $ -142,-124,-160,20,992,-124,38,-106,992,-124,-448,56,-268,-88 + $ ,1010,-268,-268,884,884,-232/ C 1 T(1,2,5,7,6,3,4) - DATA (CF(I, 3),I= 1, 6) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 3),I= 7, 12) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 3),I= 13, 18) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 3),I= 19, 
24) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 3),I= 25, 30) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 3),I= 31, 36) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 3),I= 37, 42) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 3),I= 43, 48) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 3),I= 49, 54) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 3),I= 55, 60) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 3),I= 61, 66) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 3),I= 67, 72) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 3),I= 73, 78) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 3),I= 79, 84) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 3),I= 85, 90) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 3),I= 91, 96) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 3),I= 97,102) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 3),I=103,108) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 3),I=109,114) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 3),I=115,120) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ + DATA (CF(I),I=240,357) /4096,-1024,1280,128,128,-16,1280,-160 + $ ,1136,992,-1024,128,128,-16,-16,-160,-160,-16,992,-448,-160,992 + $ ,128,-16,-1024,128,-160,-16,-16,2,-160,20,-142,-124,128,-16,-16 + $ ,2,2,20,20,2,-124,56,20,-124,-16,2,-160,20,-142,-124,-160,20 + $ ,992,-124,38,-106,992,-124,-448,56,-268,-88,1010,-268,-268,884 + $ ,884,-232,128,-16,-16,2,2,20,1280,-160,-160,20,20,200,1136,-142 + $ ,992,-124,1010,1028,-142,38,-124,-106,-268,-88,20,2,-124,56,20, + $ 
-124,200,20,-106,-88,-124,1028,1028,-88,-88,-232,-232,272,-124, + $ -106,56,-88,884,-232/ C 1 T(1,2,6,5,7,3,4) - DATA (CF(I, 4),I= 1, 6) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 4),I= 7, 12) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 4),I= 13, 18) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 4),I= 19, 24) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 4),I= 25, 30) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 4),I= 31, 36) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 4),I= 37, 42) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 4),I= 43, 48) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 4),I= 49, 54) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 4),I= 55, 60) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 4),I= 61, 66) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 4),I= 67, 72) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 4),I= 73, 78) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 4),I= 79, 84) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 4),I= 85, 90) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 4),I= 91, 96) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 4),I= 97,102) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 4),I=103,108) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 4),I=109,114) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 4),I=115,120) 
/1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ + DATA (CF(I),I=358,474) /4096,128,-1024,-16,-160,-160,992,992, + $ -448,128,-1024,-16,-160,128,-16,-16,128,1136,992,1280,-160,-16, + $ -160,128,-1024,-16,128,2,20,20,-124,-124,56,-16,128,2,20,-16,2 + $ ,2,-16,-142,-124,-160,20,2,20,20,-124,-124,56,20,200,-124,1028, + $ -106,-88,-124,-106,56,-88,884,-232,1028,-88,-88,-232,-232,272, + $ -16,128,2,20,-16,2,-160,1280,20,200,-160,20,-142,38,-124,-106, + $ -268,-88,1136,-142,992,-124,1010,1028,2,-16,-142,-124,-160,20 + $ ,20,-160,38,-106,992,-124,1010,-268,-268,884,884,-232,992,-124, + $ -448,56,-268,-88/ C 1 T(1,2,6,7,5,3,4) - DATA (CF(I, 5),I= 1, 6) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 5),I= 7, 12) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 5),I= 13, 18) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 5),I= 19, 24) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 5),I= 25, 30) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 5),I= 31, 36) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 5),I= 37, 42) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 5),I= 43, 48) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 5),I= 49, 54) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 5),I= 55, 60) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 5),I= 61, 66) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 5),I= 67, 72) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 5),I= 73, 78) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 5),I= 79, 84) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 5),I= 85, 90) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 5),I= 91, 96) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ 
-1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 5),I= 97,102) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 5),I=103,108) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 5),I=109,114) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 5),I=115,120) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ + DATA (CF(I),I=475,590) /4096,-1024,-16,128,1136,992,1280,-160, + $ -160,-16,992,-448,-160,992,-1024,128,128,-16,-16,-160,-16,128, + $ -160,-16,-1024,128,2,-16,-142,-124,-160,20,20,2,-124,56,20,-124 + $ ,128,-16,-16,2,2,20,2,-16,-142,-124,-160,20,20,-160,38,-106,992 + $ ,-124,1010,-268,-268,884,884,-232,992,-124,-448,56,-268,-88,20 + $ ,2,-124,56,20,-124,200,20,-106,-88,-124,1028,1028,-88,-88,-232, + $ -232,272,-124,-106,56,-88,884,-232,128,-16,-16,2,2,20,1280,-160 + $ ,-160,20,20,200,1136,-142,992,-124,1010,1028,-142,38,-124,-106, + $ -268,-88/ C 1 T(1,2,7,5,6,3,4) - DATA (CF(I, 6),I= 1, 6) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 6),I= 7, 12) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 6),I= 13, 18) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 6),I= 19, 24) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 6),I= 25, 30) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 6),I= 31, 36) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 6),I= 37, 42) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 6),I= 43, 48) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 6),I= 49, 54) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 6),I= 55, 60) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 6),I= 61, 66) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 6),I= 67, 72) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 6),I= 73, 78) 
/3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 6),I= 79, 84) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 6),I= 85, 90) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 6),I= 91, 96) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 6),I= 97,102) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 6),I=103,108) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 6),I=109,114) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 6),I=115,120) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ + DATA (CF(I),I=591,705) /4096,-160,-16,992,-448,-160,992,-16,128 + $ ,1136,992,1280,-160,128,-1024,-16,-160,128,-16,-160,-16,-16,128 + $ ,128,-1024,20,2,-124,56,20,-124,2,-16,-142,-124,-160,20,-16,128 + $ ,2,20,-16,2,20,2,-124,56,20,-124,200,20,-106,-88,-124,1028,1028 + $ ,-88,-88,-232,-232,272,-124,-106,56,-88,884,-232,2,-16,-142, + $ -124,-160,20,20,-160,38,-106,992,-124,1010,-268,-268,884,884, + $ -232,992,-124,-448,56,-268,-88,-16,128,2,20,-16,2,-160,1280,20 + $ ,200,-160,20,-142,38,-124,-106,-268,-88,1136,-142,992,-124,1010 + $ ,1028/ C 1 T(1,2,7,6,5,3,4) - DATA (CF(I, 7),I= 1, 6) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 7),I= 7, 12) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 7),I= 13, 18) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 7),I= 19, 24) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 7),I= 25, 30) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 7),I= 31, 36) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 7),I= 37, 42) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 7),I= 43, 48) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 7),I= 49, 54) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ 
-2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 7),I= 55, 60) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 7),I= 61, 66) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 7),I= 67, 72) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 7),I= 73, 78) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 7),I= 79, 84) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 7),I= 85, 90) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 7),I= 91, 96) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 7),I= 97,102) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 7),I=103,108) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 7),I=109,114) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 7),I=115,120) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ + DATA (CF(I),I=706,819) /4096,-1024,-1024,128,128,1280,1280,-160 + $ ,128,-16,992,1136,-160,992,-16,-160,-448,992,128,-16,-16,2,2,20 + $ ,1280,-160,-160,20,20,200,1136,-142,992,-124,1010,1028,-142,38, + $ -124,-106,-268,-88,-1024,128,128,-16,-16,-160,128,-16,-16,2,2 + $ ,20,-16,2,-160,20,-142,-124,2,20,20,-124,-124,56,-160,20,-16,2, + $ -124,-142,992,-124,-448,56,-268,-88,-160,20,992,-124,38,-106, + $ -268,1010,884,-232,-268,884,20,-124,2,20,56,-124,-124,-106,56, + $ -88,884,-232,20,200,-124,1028,-106,-88,-88,1028,-232,272,-88, + $ -232/ C 1 T(1,5,2,6,7,3,4) - DATA (CF(I, 8),I= 1, 6) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 8),I= 7, 12) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 8),I= 13, 18) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 8),I= 19, 24) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 8),I= 25, 30) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 8),I= 31, 36) 
/-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 8),I= 37, 42) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 8),I= 43, 48) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 8),I= 49, 54) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 8),I= 55, 60) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 8),I= 61, 66) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 8),I= 67, 72) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 8),I= 73, 78) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 8),I= 79, 84) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 8),I= 85, 90) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 8),I= 91, 96) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 8),I= 97,102) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 8),I=103,108) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 8),I=109,114) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 8),I=115,120) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ + DATA (CF(I),I=820,932) /4096,128,1280,-1024,128,-160,992,-16, + $ -160,-448,992,1280,-160,128,-16,992,1136,-16,128,2,20,-16,2, + $ -160,1280,20,200,-160,20,-142,38,-124,-106,-268,-88,1136,-142 + $ ,992,-124,1010,1028,128,-1024,-16,-160,128,-16,-16,128,2,20,-16 + $ ,2,2,20,20,-124,-124,56,-16,2,-160,20,-142,-124,20,-124,2,20,56 + $ ,-124,-124,-106,56,-88,884,-232,20,200,-124,1028,-106,-88,-88 + $ ,1028,-232,272,-88,-232,-160,20,-16,2,-124,-142,992,-124,-448 + $ ,56,-268,-88,-160,20,992,-124,38,-106,-268,1010,884,-232,-268 + $ ,884/ C 1 T(1,5,2,7,6,3,4) - DATA (CF(I, 9),I= 1, 6) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 9),I= 7, 12) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ 
-1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 9),I= 13, 18) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 9),I= 19, 24) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 9),I= 25, 30) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 9),I= 31, 36) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 9),I= 37, 42) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 9),I= 43, 48) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 9),I= 49, 54) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 9),I= 55, 60) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 9),I= 61, 66) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 9),I= 67, 72) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 9),I= 73, 78) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 9),I= 79, 84) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 9),I= 85, 90) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 9),I= 91, 96) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 9),I= 97,102) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 9),I=103,108) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 9),I=109,114) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 9),I=115,120) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ + DATA (CF(I),I=933,1044) /4096,-1024,1280,128,128,-16,-1024,128, + $ -160,-16,992,-448,-160,-16,992,-160,-16,2,-160,20,-142,-124, + $ -160,20,992,-124,38,-106,992,-124,-448,56,-268,-88,1010,-268, + $ 
-268,884,884,-232,128,-16,-1024,128,-160,-16,-16,2,-160,20,-142 + $ ,-124,128,-16,-16,2,2,20,20,2,-124,56,20,-124,-16,2,128,-16,20 + $ ,2,1136,-142,992,-124,1010,1028,1280,-160,-160,20,20,200,38, + $ -142,-268,-88,-124,-106,-124,56,20,2,-124,20,1028,-88,-88,-232, + $ -232,272,200,20,-106,-88,-124,1028,-106,-124,884,-232,56,-88/ C 1 T(1,5,6,2,7,3,4) - DATA (CF(I, 10),I= 1, 6) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 10),I= 7, 12) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 10),I= 13, 18) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 10),I= 19, 24) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 10),I= 25, 30) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 10),I= 31, 36) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 10),I= 37, 42) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 10),I= 43, 48) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 10),I= 49, 54) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 10),I= 55, 60) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 10),I= 61, 66) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 10),I= 67, 72) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 10),I= 73, 78) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 10),I= 79, 84) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 10),I= 85, 90) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 10),I= 91, 96) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 10),I= 97,102) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 10),I=103,108) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ 
,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 10),I=109,114) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 10),I=115,120) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ + DATA (CF(I),I=1045,1155) /4096,128,-1024,-16,-160,128,-1024,-16 + $ ,128,1136,992,-16,128,-160,1280,2,20,20,-124,-124,56,20,200, + $ -124,1028,-106,-88,-124,-106,56,-88,884,-232,1028,-88,-88,-232, + $ -232,272,-16,-160,128,-1024,-16,128,2,20,20,-124,-124,56,-16 + $ ,128,2,20,-16,2,2,-16,-142,-124,-160,20,2,20,-16,128,2,-16,-142 + $ ,38,-124,-106,-268,-88,-160,1280,20,200,-160,20,-142,1136,1010 + $ ,1028,992,-124,-142,-124,2,-16,20,-160,1010,-268,-268,884,884, + $ -232,20,-160,38,-106,992,-124,-124,992,-268,-88,-448,56/ C 1 T(1,5,6,7,2,3,4) - DATA (CF(I, 11),I= 1, 6) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 11),I= 7, 12) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 11),I= 13, 18) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 11),I= 19, 24) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 11),I= 25, 30) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 11),I= 31, 36) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 11),I= 37, 42) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 11),I= 43, 48) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 11),I= 49, 54) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 11),I= 55, 60) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 11),I= 61, 66) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 11),I= 67, 72) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 11),I= 73, 78) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 11),I= 79, 84) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 11),I= 85, 90) /3.086419753086420D-01 
- $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 11),I= 91, 96) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 11),I= 97,102) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 11),I=103,108) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 11),I=109,114) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 11),I=115,120) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ + DATA (CF(I),I=1156,1265) /4096,-1024,992,-448,-160,-16,992,-160 + $ ,128,-16,-1024,128,-160,-16,2,-16,-142,-124,-160,20,20,-160,38, + $ -106,992,-124,1010,-268,-268,884,884,-232,992,-124,-448,56,-268 + $ ,-88,-16,128,-160,-16,-1024,128,2,-16,-142,-124,-160,20,20,2, + $ -124,56,20,-124,128,-16,-16,2,2,20,-124,56,20,2,-124,20,1028, + $ -88,-88,-232,-232,272,200,20,-106,-88,-124,1028,-106,-124,884, + $ -232,56,-88,-16,2,128,-16,20,2,1136,-142,992,-124,1010,1028 + $ ,1280,-160,-160,20,20,200,38,-142,-268,-88,-124,-106/ C 1 T(1,5,7,2,6,3,4) - DATA (CF(I, 12),I= 1, 6) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 12),I= 7, 12) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 12),I= 13, 18) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 12),I= 19, 24) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 12),I= 25, 30) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 12),I= 31, 36) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 12),I= 37, 42) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 12),I= 43, 48) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 12),I= 49, 54) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 12),I= 55, 60) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 12),I= 61, 66) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ 
,3.086419753086420D-02/ - DATA (CF(I, 12),I= 67, 72) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 12),I= 73, 78) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 12),I= 79, 84) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 12),I= 85, 90) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 12),I= 91, 96) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 12),I= 97,102) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 12),I=103,108) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 12),I=109,114) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 12),I=115,120) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ + DATA (CF(I),I=1266,1374) /4096,1136,992,-16,128,-160,1280,-16, + $ -160,128,-1024,-16,128,20,2,-124,56,20,-124,200,20,-106,-88, + $ -124,1028,1028,-88,-88,-232,-232,272,-124,-106,56,-88,884,-232, + $ -160,-16,-16,128,128,-1024,20,2,-124,56,20,-124,2,-16,-142,-124 + $ ,-160,20,-16,128,2,20,-16,2,-142,-124,2,-16,20,-160,1010,-268, + $ -268,884,884,-232,20,-160,38,-106,992,-124,-124,992,-268,-88, + $ -448,56,2,20,-16,128,2,-16,-142,38,-124,-106,-268,-88,-160,1280 + $ ,20,200,-160,20,-142,1136,1010,1028,992,-124/ C 1 T(1,5,7,6,2,3,4) - DATA (CF(I, 13),I= 1, 6) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 13),I= 7, 12) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 13),I= 13, 18) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 13),I= 19, 24) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 13),I= 25, 30) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 13),I= 31, 36) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 13),I= 37, 42) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 13),I= 43, 48) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ 
-1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 13),I= 49, 54) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 13),I= 55, 60) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 13),I= 61, 66) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 13),I= 67, 72) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 13),I= 73, 78) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 13),I= 79, 84) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 13),I= 85, 90) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 13),I= 91, 96) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 13),I= 97,102) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 13),I=103,108) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 13),I=109,114) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 13),I=115,120) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ + DATA (CF(I),I=1375,1482) /4096,-1024,-1024,128,128,1280,992,-160 + $ ,-448,992,-16,-160,-16,2,128,-16,20,2,1136,-142,992,-124,1010 + $ ,1028,1280,-160,-160,20,20,200,38,-142,-268,-88,-124,-106,-160 + $ ,20,-16,2,-124,-142,992,-124,-448,56,-268,-88,-160,20,992,-124 + $ ,38,-106,-268,1010,884,-232,-268,884,-1024,128,128,-16,-16,-160 + $ ,128,-16,-16,2,2,20,-16,2,-160,20,-142,-124,2,20,20,-124,-124 + $ ,56,-124,20,56,-124,2,20,-106,-124,884,-232,56,-88,-88,1028, + $ -232,272,-88,-232,20,200,-124,1028,-106,-88/ C 1 T(1,6,2,5,7,3,4) - DATA (CF(I, 14),I= 1, 6) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 14),I= 7, 12) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 14),I= 13, 18) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 14),I= 19, 24) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 14),I= 25, 30) /3.086419753086420D-03 - $ 
,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 14),I= 31, 36) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 14),I= 37, 42) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 14),I= 43, 48) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 14),I= 49, 54) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 14),I= 55, 60) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 14),I= 61, 66) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 14),I= 67, 72) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 14),I= 73, 78) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 14),I= 79, 84) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 14),I= 85, 90) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 14),I= 91, 96) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 14),I= 97,102) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 14),I=103,108) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 14),I=109,114) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 14),I=115,120) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ + DATA (CF(I),I=1483,1589) /4096,128,1280,-1024,128,-160,1280,992 + $ ,1136,128,-16,2,20,-16,128,2,-16,-142,38,-124,-106,-268,-88, + $ -160,1280,20,200,-160,20,-142,1136,1010,1028,992,-124,20,-124,2 + $ ,20,56,-124,-124,-106,56,-88,884,-232,20,200,-124,1028,-106,-88 + $ ,-88,1028,-232,272,-88,-232,128,-1024,-16,-160,128,-16,-16,128 + $ ,2,20,-16,2,2,20,20,-124,-124,56,-16,2,-160,20,-142,-124,20, + $ -160,-124,-142,-16,2,-124,992,-268,-88,-448,56,-268,1010,884, + $ -232,-268,884,-160,20,992,-124,38,-106/ C 1 T(1,6,2,7,5,3,4) - DATA (CF(I, 15),I= 1, 6) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 15),I= 7, 
12) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 15),I= 13, 18) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 15),I= 19, 24) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 15),I= 25, 30) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 15),I= 31, 36) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 15),I= 37, 42) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 15),I= 43, 48) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 15),I= 49, 54) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 15),I= 55, 60) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 15),I= 61, 66) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 15),I= 67, 72) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 15),I= 73, 78) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 15),I= 79, 84) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 15),I= 85, 90) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 15),I= 91, 96) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 15),I= 97,102) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 15),I=103,108) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 15),I=109,114) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 15),I=115,120) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ + DATA (CF(I),I=1590,1695) /4096,-1024,1280,128,-448,992,992,-160, + $ -160,-16,-160,20,-16,2,-124,-142,992,-124,-448,56,-268,-88,-160 + 
$ ,20,992,-124,38,-106,-268,1010,884,-232,-268,884,-16,2,128,-16 + $ ,20,2,1136,-142,992,-124,1010,1028,1280,-160,-160,20,20,200,38, + $ -142,-268,-88,-124,-106,128,-16,-1024,128,-160,-16,-16,2,-160 + $ ,20,-142,-124,128,-16,-16,2,2,20,20,2,-124,56,20,-124,56,-124, + $ -124,20,20,2,-88,1028,-232,272,-88,-232,-106,-124,884,-232,56, + $ -88,200,20,-106,-88,-124,1028/ C 1 T(1,6,5,2,7,3,4) - DATA (CF(I, 16),I= 1, 6) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 16),I= 7, 12) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 16),I= 13, 18) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 16),I= 19, 24) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 16),I= 25, 30) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 16),I= 31, 36) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 16),I= 37, 42) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 16),I= 43, 48) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 16),I= 49, 54) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 16),I= 55, 60) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 16),I= 61, 66) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 16),I= 67, 72) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 16),I= 73, 78) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 16),I= 79, 84) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 16),I= 85, 90) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 16),I= 91, 96) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 16),I= 97,102) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 16),I=103,108) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ 
-3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 16),I=109,114) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 16),I=115,120) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ + DATA (CF(I),I=1696,1800) /4096,128,-1024,992,1136,-160,1280,-16 + $ ,128,20,-124,2,20,56,-124,-124,-106,56,-88,884,-232,20,200,-124 + $ ,1028,-106,-88,-88,1028,-232,272,-88,-232,2,20,-16,128,2,-16, + $ -142,38,-124,-106,-268,-88,-160,1280,20,200,-160,20,-142,1136 + $ ,1010,1028,992,-124,-16,-160,128,-1024,-16,128,2,20,20,-124, + $ -124,56,-16,128,2,20,-16,2,2,-16,-142,-124,-160,20,-124,-142,20 + $ ,-160,2,-16,-268,1010,884,-232,-268,884,-124,992,-268,-88,-448 + $ ,56,20,-160,38,-106,992,-124/ C 1 T(1,6,5,7,2,3,4) - DATA (CF(I, 17),I= 1, 6) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 17),I= 7, 12) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 17),I= 13, 18) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 17),I= 19, 24) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 17),I= 25, 30) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 17),I= 31, 36) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 17),I= 37, 42) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 17),I= 43, 48) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 17),I= 49, 54) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 17),I= 55, 60) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 17),I= 61, 66) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 17),I= 67, 72) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 17),I= 73, 78) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 17),I= 79, 84) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 17),I= 85, 90) /3.086419753086420D-02 - $ 
,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 17),I= 91, 96) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 17),I= 97,102) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 17),I=103,108) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 17),I=109,114) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 17),I=115,120) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ + DATA (CF(I),I=1801,1904) /4096,-1024,-16,128,-160,-16,-1024,128, + $ -142,-124,2,-16,20,-160,1010,-268,-268,884,884,-232,20,-160,38, + $ -106,992,-124,-124,992,-268,-88,-448,56,-124,56,20,2,-124,20 + $ ,1028,-88,-88,-232,-232,272,200,20,-106,-88,-124,1028,-106,-124 + $ ,884,-232,56,-88,-16,128,-160,-16,-1024,128,2,-16,-142,-124, + $ -160,20,20,2,-124,56,20,-124,128,-16,-16,2,2,20,2,-16,20,2,128, + $ -16,-142,1136,1010,1028,992,-124,38,-142,-268,-88,-124,-106 + $ ,1280,-160,-160,20,20,200/ C 1 T(1,6,7,2,5,3,4) - DATA (CF(I, 18),I= 1, 6) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 18),I= 7, 12) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 18),I= 13, 18) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 18),I= 19, 24) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 18),I= 25, 30) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 18),I= 31, 36) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 18),I= 37, 42) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 18),I= 43, 48) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 18),I= 49, 54) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 18),I= 55, 60) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 18),I= 61, 66) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 18),I= 67, 72) 
/-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 18),I= 73, 78) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 18),I= 79, 84) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 18),I= 85, 90) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 18),I= 91, 96) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 18),I= 97,102) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 18),I=103,108) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 18),I=109,114) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 18),I=115,120) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ + DATA (CF(I),I=1905,2007) /4096,-160,-16,-16,128,128,-1024,-124 + $ ,56,20,2,-124,20,1028,-88,-88,-232,-232,272,200,20,-106,-88, + $ -124,1028,-106,-124,884,-232,56,-88,-142,-124,2,-16,20,-160 + $ ,1010,-268,-268,884,884,-232,20,-160,38,-106,992,-124,-124,992, + $ -268,-88,-448,56,-160,-16,-16,128,128,-1024,20,2,-124,56,20, + $ -124,2,-16,-142,-124,-160,20,-16,128,2,20,-16,2,20,2,2,-16,-16 + $ ,128,38,-142,-268,-88,-124,-106,-142,1136,1010,1028,992,-124, + $ -160,1280,20,200,-160,20/ C 1 T(1,6,7,5,2,3,4) - DATA (CF(I, 19),I= 1, 6) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 19),I= 7, 12) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 19),I= 13, 18) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 19),I= 19, 24) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 19),I= 25, 30) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 19),I= 31, 36) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 19),I= 37, 42) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 19),I= 43, 48) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 19),I= 49, 54) 
/3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 19),I= 55, 60) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 19),I= 61, 66) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 19),I= 67, 72) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 19),I= 73, 78) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 19),I= 79, 84) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 19),I= 85, 90) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 19),I= 91, 96) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 19),I= 97,102) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 19),I=103,108) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 19),I=109,114) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 19),I=115,120) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ + DATA (CF(I),I=2008,2109) /4096,-1024,-1024,128,128,1280,2,-16,20 + $ ,2,128,-16,-142,1136,1010,1028,992,-124,38,-142,-268,-88,-124, + $ -106,1280,-160,-160,20,20,200,20,-160,-124,-142,-16,2,-124,992, + $ -268,-88,-448,56,-268,1010,884,-232,-268,884,-160,20,992,-124 + $ ,38,-106,-124,20,56,-124,2,20,-106,-124,884,-232,56,-88,-88 + $ ,1028,-232,272,-88,-232,20,200,-124,1028,-106,-88,-1024,128,128 + $ ,-16,-16,-160,128,-16,-16,2,2,20,-16,2,-160,20,-142,-124,2,20 + $ ,20,-124,-124,56/ C 1 T(1,7,2,5,6,3,4) - DATA (CF(I, 20),I= 1, 6) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 20),I= 7, 12) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 20),I= 13, 18) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 20),I= 19, 24) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 20),I= 25, 30) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA 
(CF(I, 20),I= 31, 36) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 20),I= 37, 42) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 20),I= 43, 48) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 20),I= 49, 54) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 20),I= 55, 60) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 20),I= 61, 66) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 20),I= 67, 72) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 20),I= 73, 78) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 20),I= 79, 84) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 20),I= 85, 90) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 20),I= 91, 96) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 20),I= 97,102) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 20),I=103,108) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 20),I=109,114) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 20),I=115,120) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ + DATA (CF(I),I=2110,2210) /4096,128,1280,-1024,128,20,2,2,-16,-16 + $ ,128,38,-142,-268,-88,-124,-106,-142,1136,1010,1028,992,-124, + $ -160,1280,20,200,-160,20,-124,20,56,-124,2,20,-106,-124,884, + $ -232,56,-88,-88,1028,-232,272,-88,-232,20,200,-124,1028,-106, + $ -88,20,-160,-124,-142,-16,2,-124,992,-268,-88,-448,56,-268,1010 + $ ,884,-232,-268,884,-160,20,992,-124,38,-106,128,-1024,-16,-160 + $ ,128,-16,-16,128,2,20,-16,2,2,20,20,-124,-124,56,-16,2,-160,20, + $ -142,-124/ C 1 T(1,7,2,6,5,3,4) - DATA (CF(I, 21),I= 1, 6) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 21),I= 7, 12) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - 
DATA (CF(I, 21),I= 13, 18) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 21),I= 19, 24) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 21),I= 25, 30) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 21),I= 31, 36) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 21),I= 37, 42) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 21),I= 43, 48) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 21),I= 49, 54) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 21),I= 55, 60) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 21),I= 61, 66) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 21),I= 67, 72) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 21),I= 73, 78) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 21),I= 79, 84) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 21),I= 85, 90) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 21),I= 91, 96) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 21),I= 97,102) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 21),I=103,108) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 21),I=109,114) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 21),I=115,120) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ + DATA (CF(I),I=2211,2310) /4096,-1024,1280,128,20,-160,-124,-142, + $ -16,2,-124,992,-268,-88,-448,56,-268,1010,884,-232,-268,884, + $ -160,20,992,-124,38,-106,2,-16,20,2,128,-16,-142,1136,1010,1028 + $ ,992,-124,38,-142,-268,-88,-124,-106,1280,-160,-160,20,20,200 + $ 
,56,-124,-124,20,20,2,-88,1028,-232,272,-88,-232,-106,-124,884, + $ -232,56,-88,200,20,-106,-88,-124,1028,128,-16,-1024,128,-160, + $ -16,-16,2,-160,20,-142,-124,128,-16,-16,2,2,20,20,2,-124,56,20, + $ -124/ C 1 T(1,7,5,2,6,3,4) - DATA (CF(I, 22),I= 1, 6) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 22),I= 7, 12) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 22),I= 13, 18) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 22),I= 19, 24) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 22),I= 25, 30) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 22),I= 31, 36) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 22),I= 37, 42) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 22),I= 43, 48) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 22),I= 49, 54) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 22),I= 55, 60) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 22),I= 61, 66) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 22),I= 67, 72) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 22),I= 73, 78) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 22),I= 79, 84) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 22),I= 85, 90) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 22),I= 91, 96) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 22),I= 97,102) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 22),I=103,108) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 22),I=109,114) /-2.469135802469136D-02 - $ 
,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 22),I=115,120) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ + DATA (CF(I),I=2311,2409) /4096,128,-1024,-124,20,56,-124,2,20, + $ -106,-124,884,-232,56,-88,-88,1028,-232,272,-88,-232,20,200, + $ -124,1028,-106,-88,20,2,2,-16,-16,128,38,-142,-268,-88,-124, + $ -106,-142,1136,1010,1028,992,-124,-160,1280,20,200,-160,20,-124 + $ ,-142,20,-160,2,-16,-268,1010,884,-232,-268,884,-124,992,-268, + $ -88,-448,56,20,-160,38,-106,992,-124,-16,-160,128,-1024,-16,128 + $ ,2,20,20,-124,-124,56,-16,128,2,20,-16,2,2,-16,-142,-124,-160 + $ ,20/ C 1 T(1,7,5,6,2,3,4) - DATA (CF(I, 23),I= 1, 6) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 23),I= 7, 12) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 23),I= 13, 18) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 23),I= 19, 24) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 23),I= 25, 30) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 23),I= 31, 36) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 23),I= 37, 42) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 23),I= 43, 48) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 23),I= 49, 54) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 23),I= 55, 60) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 23),I= 61, 66) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 23),I= 67, 72) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 23),I= 73, 78) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 23),I= 79, 84) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 23),I= 85, 90) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 23),I= 91, 96) /1.975308641975309D+00, - $ 
-2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 23),I= 97,102) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 23),I=103,108) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 23),I=109,114) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 23),I=115,120) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ + DATA (CF(I),I=2410,2507) /4096,-1024,-124,-142,20,-160,2,-16, + $ -268,1010,884,-232,-268,884,-124,992,-268,-88,-448,56,20,-160 + $ ,38,-106,992,-124,56,-124,-124,20,20,2,-88,1028,-232,272,-88, + $ -232,-106,-124,884,-232,56,-88,200,20,-106,-88,-124,1028,2,-16 + $ ,20,2,128,-16,-142,1136,1010,1028,992,-124,38,-142,-268,-88, + $ -124,-106,1280,-160,-160,20,20,200,-16,128,-160,-16,-1024,128,2 + $ ,-16,-142,-124,-160,20,20,2,-124,56,20,-124,128,-16,-16,2,2,20/ C 1 T(1,7,6,2,5,3,4) - DATA (CF(I, 24),I= 1, 6) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 24),I= 7, 12) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 24),I= 13, 18) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 24),I= 19, 24) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 24),I= 25, 30) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 24),I= 31, 36) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 24),I= 37, 42) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 24),I= 43, 48) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 24),I= 49, 54) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 24),I= 55, 60) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 24),I= 61, 66) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 24),I= 67, 72) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 24),I= 73, 78) 
/3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 24),I= 79, 84) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 24),I= 85, 90) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 24),I= 91, 96) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 24),I= 97,102) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 24),I=103,108) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 24),I=109,114) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 24),I=115,120) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ + DATA (CF(I),I=2508,2604) /4096,56,-124,-124,20,20,2,-88,1028, + $ -232,272,-88,-232,-106,-124,884,-232,56,-88,200,20,-106,-88, + $ -124,1028,-124,-142,20,-160,2,-16,-268,1010,884,-232,-268,884, + $ -124,992,-268,-88,-448,56,20,-160,38,-106,992,-124,20,2,2,-16, + $ -16,128,38,-142,-268,-88,-124,-106,-142,1136,1010,1028,992,-124 + $ ,-160,1280,20,200,-160,20,-160,-16,-16,128,128,-1024,20,2,-124 + $ ,56,20,-124,2,-16,-142,-124,-160,20,-16,128,2,20,-16,2/ C 1 T(1,7,6,5,2,3,4) - DATA (CF(I, 25),I= 1, 6) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 25),I= 7, 12) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 25),I= 13, 18) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 25),I= 19, 24) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 25),I= 25, 30) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 25),I= 31, 36) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 25),I= 37, 42) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 25),I= 43, 48) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 25),I= 49, 54) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 25),I= 55, 60) 
/1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 25),I= 61, 66) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 25),I= 67, 72) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 25),I= 73, 78) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 25),I= 79, 84) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 25),I= 85, 90) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 25),I= 91, 96) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 25),I= 97,102) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 25),I=103,108) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 25),I=109,114) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 25),I=115,120) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ + DATA (CF(I),I=2605,2700) /4096,-1024,-1024,128,128,1280,-1024 + $ ,128,128,-16,-16,-160,128,-16,1280,-160,1136,992,-16,-160,-160 + $ ,992,992,-448,1280,-160,-160,20,20,200,128,-16,-16,2,2,20,992, + $ -124,1136,-142,1028,1010,-124,-106,-142,38,-88,-268,-160,20,992 + $ ,-124,38,-106,-16,2,-160,20,-142,-124,-448,56,992,-124,-88,-268 + $ ,-268,884,1010,-268,-232,884,20,200,-124,1028,-106,-88,2,20,20, + $ -124,-124,56,56,-88,-124,-106,-232,884,-88,-232,1028,-88,272, + $ -232/ C 1 T(2,1,5,6,7,3,4) - DATA (CF(I, 26),I= 1, 6) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 26),I= 7, 12) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 26),I= 13, 18) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 26),I= 19, 24) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 26),I= 25, 30) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 26),I= 31, 36) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 
26),I= 37, 42) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 26),I= 43, 48) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 26),I= 49, 54) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 26),I= 55, 60) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 26),I= 61, 66) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 26),I= 67, 72) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 26),I= 73, 78) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 26),I= 79, 84) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 26),I= 85, 90) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 26),I= 91, 96) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 26),I= 97,102) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 26),I=103,108) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 26),I=109,114) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 26),I=115,120) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ + DATA (CF(I),I=2701,2795) /4096,128,1280,-1024,128,128,-1024,-16, + $ -160,128,-16,-16,-160,-160,992,992,-448,128,-16,1280,-160,1136 + $ ,992,-160,1280,20,200,-160,20,-16,128,2,20,-16,2,-124,-106,-142 + $ ,38,-88,-268,992,-124,1136,-142,1028,1010,20,200,-124,1028,-106 + $ ,-88,2,20,20,-124,-124,56,56,-88,-124,-106,-232,884,-88,-232 + $ ,1028,-88,272,-232,-160,20,992,-124,38,-106,-16,2,-160,20,-142, + $ -124,-448,56,992,-124,-88,-268,-268,884,1010,-268,-232,884/ C 1 T(2,1,5,7,6,3,4) - DATA (CF(I, 27),I= 1, 6) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 27),I= 7, 12) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 27),I= 13, 18) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 
27),I= 19, 24) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 27),I= 25, 30) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 27),I= 31, 36) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 27),I= 37, 42) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 27),I= 43, 48) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 27),I= 49, 54) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 27),I= 55, 60) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 27),I= 61, 66) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 27),I= 67, 72) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 27),I= 73, 78) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 27),I= 79, 84) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 27),I= 85, 90) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 27),I= 91, 96) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 27),I= 97,102) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 27),I=103,108) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 27),I=109,114) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 27),I=115,120) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ + DATA (CF(I),I=2796,2889) /4096,-1024,1280,128,128,-16,1280,-160 + $ ,1136,992,-1024,128,128,-16,-16,-160,-160,-16,992,-448,-160,992 + $ ,-160,20,992,-124,38,-106,-16,2,-160,20,-142,-124,-448,56,992, + $ -124,-88,-268,-268,884,1010,-268,-232,884,1280,-160,-160,20,20 + $ ,200,128,-16,-16,2,2,20,992,-124,1136,-142,1028,1010,-124,-106, + $ -142,38,-88,-268,200,20,-106,-88,-124,1028,20,2,-124,56,20,-124 + $ ,-88,-232,1028,-88,272,-232,56,-88,-124,-106,-232,884/ C 1 T(2,1,6,5,7,3,4) - DATA 
(CF(I, 28),I= 1, 6) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 28),I= 7, 12) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 28),I= 13, 18) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 28),I= 19, 24) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 28),I= 25, 30) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 28),I= 31, 36) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 28),I= 37, 42) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 28),I= 43, 48) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 28),I= 49, 54) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 28),I= 55, 60) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 28),I= 61, 66) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 28),I= 67, 72) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 28),I= 73, 78) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 28),I= 79, 84) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 28),I= 85, 90) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 28),I= 91, 96) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 28),I= 97,102) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 28),I=103,108) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 28),I=109,114) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 28),I=115,120) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ 
-1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ + DATA (CF(I),I=2890,2982) /4096,128,-1024,-16,-160,-160,992,992, + $ -448,128,-1024,-16,-160,128,-16,-16,128,1136,992,1280,-160,20 + $ ,200,-124,1028,-106,-88,2,20,20,-124,-124,56,56,-88,-124,-106, + $ -232,884,-88,-232,1028,-88,272,-232,-160,1280,20,200,-160,20, + $ -16,128,2,20,-16,2,-124,-106,-142,38,-88,-268,992,-124,1136, + $ -142,1028,1010,20,-160,38,-106,992,-124,2,-16,-142,-124,-160,20 + $ ,-268,884,1010,-268,-232,884,-448,56,992,-124,-88,-268/ C 1 T(2,1,6,7,5,3,4) - DATA (CF(I, 29),I= 1, 6) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 29),I= 7, 12) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 29),I= 13, 18) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 29),I= 19, 24) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 29),I= 25, 30) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 29),I= 31, 36) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 29),I= 37, 42) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 29),I= 43, 48) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 29),I= 49, 54) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 29),I= 55, 60) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 29),I= 61, 66) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 29),I= 67, 72) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 29),I= 73, 78) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 29),I= 79, 84) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 29),I= 85, 90) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 29),I= 91, 96) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 29),I= 97,102) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ 
,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 29),I=103,108) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 29),I=109,114) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 29),I=115,120) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ + DATA (CF(I),I=2983,3074) /4096,-1024,-16,128,1136,992,1280,-160, + $ -160,-16,992,-448,-160,992,-1024,128,128,-16,-16,-160,20,-160 + $ ,38,-106,992,-124,2,-16,-142,-124,-160,20,-268,884,1010,-268, + $ -232,884,-448,56,992,-124,-88,-268,200,20,-106,-88,-124,1028,20 + $ ,2,-124,56,20,-124,-88,-232,1028,-88,272,-232,56,-88,-124,-106, + $ -232,884,1280,-160,-160,20,20,200,128,-16,-16,2,2,20,992,-124 + $ ,1136,-142,1028,1010,-124,-106,-142,38,-88,-268/ C 1 T(2,1,7,5,6,3,4) - DATA (CF(I, 30),I= 1, 6) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 30),I= 7, 12) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 30),I= 13, 18) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 30),I= 19, 24) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 30),I= 25, 30) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 30),I= 31, 36) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 30),I= 37, 42) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 30),I= 43, 48) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 30),I= 49, 54) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 30),I= 55, 60) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 30),I= 61, 66) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 30),I= 67, 72) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 30),I= 73, 78) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 30),I= 79, 84) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ 
-1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 30),I= 85, 90) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 30),I= 91, 96) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 30),I= 97,102) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 30),I=103,108) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 30),I=109,114) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 30),I=115,120) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ + DATA (CF(I),I=3075,3165) /4096,-160,-16,992,-448,-160,992,-16 + $ ,128,1136,992,1280,-160,128,-1024,-16,-160,128,-16,200,20,-106, + $ -88,-124,1028,20,2,-124,56,20,-124,-88,-232,1028,-88,272,-232 + $ ,56,-88,-124,-106,-232,884,20,-160,38,-106,992,-124,2,-16,-142, + $ -124,-160,20,-268,884,1010,-268,-232,884,-448,56,992,-124,-88, + $ -268,-160,1280,20,200,-160,20,-16,128,2,20,-16,2,-124,-106,-142 + $ ,38,-88,-268,992,-124,1136,-142,1028,1010/ C 1 T(2,1,7,6,5,3,4) - DATA (CF(I, 31),I= 1, 6) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 31),I= 7, 12) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 31),I= 13, 18) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 31),I= 19, 24) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 31),I= 25, 30) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 31),I= 31, 36) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 31),I= 37, 42) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 31),I= 43, 48) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 31),I= 49, 54) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 31),I= 55, 60) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 31),I= 61, 66) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ 
,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 31),I= 67, 72) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 31),I= 73, 78) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 31),I= 79, 84) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 31),I= 85, 90) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 31),I= 91, 96) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 31),I= 97,102) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 31),I=103,108) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 31),I=109,114) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 31),I=115,120) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ + DATA (CF(I),I=3166,3255) /4096,-1024,-1024,128,128,1280,1280, + $ -160,128,-16,992,1136,-160,992,-16,-160,-448,992,128,-16,-16,2 + $ ,2,20,-1024,128,128,-16,-16,-160,-160,20,-16,2,-124,-142,20, + $ -124,2,20,56,-124,992,-124,-448,56,-268,-88,-160,20,-16,2,-124, + $ -142,992,-124,-160,20,-106,38,884,-232,-268,1010,884,-268,-124, + $ -106,56,-88,884,-232,20,-124,2,20,56,-124,-124,1028,20,200,-88, + $ -106,-232,272,-88,1028,-232,-88/ C 1 T(2,5,1,6,7,3,4) - DATA (CF(I, 32),I= 1, 6) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 32),I= 7, 12) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 32),I= 13, 18) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 32),I= 19, 24) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 32),I= 25, 30) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 32),I= 31, 36) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 32),I= 37, 42) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 32),I= 43, 48) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ 
-2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 32),I= 49, 54) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 32),I= 55, 60) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 32),I= 61, 66) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 32),I= 67, 72) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 32),I= 73, 78) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 32),I= 79, 84) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 32),I= 85, 90) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 32),I= 91, 96) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 32),I= 97,102) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 32),I=103,108) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 32),I=109,114) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 32),I=115,120) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ + DATA (CF(I),I=3256,3344) /4096,128,1280,-1024,128,-160,992,-16, + $ -160,-448,992,1280,-160,128,-16,992,1136,-16,128,2,20,-16,2,128 + $ ,-1024,-16,-160,128,-16,20,-124,2,20,56,-124,-160,20,-16,2,-124 + $ ,-142,-124,-106,56,-88,884,-232,20,-124,2,20,56,-124,-124,1028 + $ ,20,200,-88,-106,-232,272,-88,1028,-232,-88,992,-124,-448,56, + $ -268,-88,-160,20,-16,2,-124,-142,992,-124,-160,20,-106,38,884, + $ -232,-268,1010,884,-268/ C 1 T(2,5,1,7,6,3,4) - DATA (CF(I, 33),I= 1, 6) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 33),I= 7, 12) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 33),I= 13, 18) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 33),I= 19, 24) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 33),I= 25, 30) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ 
-2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 33),I= 31, 36) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 33),I= 37, 42) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 33),I= 43, 48) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 33),I= 49, 54) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 33),I= 55, 60) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 33),I= 61, 66) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 33),I= 67, 72) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 33),I= 73, 78) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 33),I= 79, 84) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 33),I= 85, 90) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 33),I= 91, 96) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 33),I= 97,102) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 33),I=103,108) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 33),I=109,114) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 33),I=115,120) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ + DATA (CF(I),I=3345,3432) /4096,-1024,1280,128,128,-16,-1024,128, + $ -160,-16,992,-448,-160,-16,992,-160,-16,2,-160,20,-142,-124,128 + $ ,-16,-1024,128,-160,-16,-16,2,128,-16,20,2,-124,56,20,2,-124,20 + $ ,1136,-142,992,-124,1010,1028,-16,2,128,-16,20,2,-160,20,1280, + $ -160,200,20,-268,-88,38,-142,-106,-124,1028,-88,-88,-232,-232 + $ ,272,-124,56,20,2,-124,20,-106,-88,200,20,1028,-124,884,-232, + $ -106,-124,-88,56/ C 1 T(2,5,6,1,7,3,4) - DATA (CF(I, 34),I= 1, 6) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 34),I= 7, 12) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ 
-1.358024691358025D-01/ - DATA (CF(I, 34),I= 13, 18) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 34),I= 19, 24) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 34),I= 25, 30) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 34),I= 31, 36) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 34),I= 37, 42) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 34),I= 43, 48) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 34),I= 49, 54) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 34),I= 55, 60) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 34),I= 61, 66) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 34),I= 67, 72) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 34),I= 73, 78) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 34),I= 79, 84) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 34),I= 85, 90) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 34),I= 91, 96) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 34),I= 97,102) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 34),I=103,108) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 34),I=109,114) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 34),I=115,120) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ + DATA (CF(I),I=3433,3519) /4096,128,-1024,-16,-160,128,-1024,-16 + $ ,128,1136,992,-16,128,-160,1280,2,20,20,-124,-124,56,-16,-160 + $ ,128,-1024,-16,128,2,20,-16,128,2,-16,-142,-124,2,-16,20,-160, + $ -142,38,-124,-106,-268,-88,2,20,-16,128,2,-16,20,200,-160,1280 + $ 
,20,-160,1010,1028,-142,1136,-124,992,1010,-268,-268,884,884, + $ -232,-142,-124,2,-16,20,-160,38,-106,20,-160,-124,992,-268,-88, + $ -124,992,56,-448/ C 1 T(2,5,6,7,1,3,4) - DATA (CF(I, 35),I= 1, 6) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 35),I= 7, 12) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 35),I= 13, 18) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 35),I= 19, 24) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 35),I= 25, 30) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 35),I= 31, 36) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 35),I= 37, 42) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 35),I= 43, 48) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 35),I= 49, 54) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 35),I= 55, 60) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 35),I= 61, 66) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 35),I= 67, 72) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 35),I= 73, 78) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 35),I= 79, 84) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 35),I= 85, 90) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 35),I= 91, 96) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 35),I= 97,102) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 35),I=103,108) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 35),I=109,114) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ 
-2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 35),I=115,120) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ + DATA (CF(I),I=3520,3605) /4096,-1024,992,-448,-160,-16,992,-160 + $ ,128,-16,-1024,128,-160,-16,2,-16,-142,-124,-160,20,-16,128, + $ -160,-16,-1024,128,-124,56,20,2,-124,20,-16,2,128,-16,20,2,1028 + $ ,-88,-88,-232,-232,272,-124,56,20,2,-124,20,-106,-88,200,20 + $ ,1028,-124,884,-232,-106,-124,-88,56,1136,-142,992,-124,1010 + $ ,1028,-16,2,128,-16,20,2,-160,20,1280,-160,200,20,-268,-88,38, + $ -142,-106,-124/ C 1 T(2,5,7,1,6,3,4) - DATA (CF(I, 36),I= 1, 6) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 36),I= 7, 12) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 36),I= 13, 18) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 36),I= 19, 24) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 36),I= 25, 30) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 36),I= 31, 36) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 36),I= 37, 42) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 36),I= 43, 48) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 36),I= 49, 54) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 36),I= 55, 60) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 36),I= 61, 66) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 36),I= 67, 72) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 36),I= 73, 78) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 36),I= 79, 84) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 36),I= 85, 90) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 36),I= 91, 96) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ 
,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 36),I= 97,102) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 36),I=103,108) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 36),I=109,114) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 36),I=115,120) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ + DATA (CF(I),I=3606,3690) /4096,1136,992,-16,128,-160,1280,-16, + $ -160,128,-1024,-16,128,20,2,-124,56,20,-124,-160,-16,-16,128 + $ ,128,-1024,-142,-124,2,-16,20,-160,2,20,-16,128,2,-16,1010,-268 + $ ,-268,884,884,-232,-142,-124,2,-16,20,-160,38,-106,20,-160,-124 + $ ,992,-268,-88,-124,992,56,-448,-142,38,-124,-106,-268,-88,2,20, + $ -16,128,2,-16,20,200,-160,1280,20,-160,1010,1028,-142,1136,-124 + $ ,992/ C 1 T(2,5,7,6,1,3,4) - DATA (CF(I, 37),I= 1, 6) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 37),I= 7, 12) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 37),I= 13, 18) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 37),I= 19, 24) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 37),I= 25, 30) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 37),I= 31, 36) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 37),I= 37, 42) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 37),I= 43, 48) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 37),I= 49, 54) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 37),I= 55, 60) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 37),I= 61, 66) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 37),I= 67, 72) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 37),I= 73, 78) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ 
,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 37),I= 79, 84) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 37),I= 85, 90) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 37),I= 91, 96) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 37),I= 97,102) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 37),I=103,108) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 37),I=109,114) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 37),I=115,120) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ + DATA (CF(I),I=3691,3774) /4096,-1024,-1024,128,128,1280,992,-160 + $ ,-448,992,-16,-160,992,-124,-448,56,-268,-88,-160,20,-16,2,-124 + $ ,-142,992,-124,-160,20,-106,38,884,-232,-268,1010,884,-268,128, + $ -16,-16,2,2,20,-1024,128,128,-16,-16,-160,-160,20,-16,2,-124, + $ -142,20,-124,2,20,56,-124,-106,-124,884,-232,56,-88,-124,20,56, + $ -124,2,20,-232,272,-88,1028,-232,-88,-124,1028,20,200,-88,-106/ C 1 T(2,6,1,5,7,3,4) - DATA (CF(I, 38),I= 1, 6) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 38),I= 7, 12) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 38),I= 13, 18) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 38),I= 19, 24) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 38),I= 25, 30) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 38),I= 31, 36) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 38),I= 37, 42) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 38),I= 43, 48) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 38),I= 49, 54) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 38),I= 55, 60) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 
38),I= 61, 66) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 38),I= 67, 72) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 38),I= 73, 78) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 38),I= 79, 84) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 38),I= 85, 90) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 38),I= 91, 96) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 38),I= 97,102) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 38),I=103,108) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 38),I=109,114) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 38),I=115,120) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ + DATA (CF(I),I=3775,3857) /4096,128,1280,-1024,128,-160,1280,992 + $ ,1136,128,-16,-124,-106,56,-88,884,-232,20,-124,2,20,56,-124, + $ -124,1028,20,200,-88,-106,-232,272,-88,1028,-232,-88,-16,128,2 + $ ,20,-16,2,128,-1024,-16,-160,128,-16,20,-124,2,20,56,-124,-160 + $ ,20,-16,2,-124,-142,-124,992,-268,-88,-448,56,20,-160,-124,-142 + $ ,-16,2,884,-232,-268,1010,884,-268,992,-124,-160,20,-106,38/ C 1 T(2,6,1,7,5,3,4) - DATA (CF(I, 39),I= 1, 6) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 39),I= 7, 12) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 39),I= 13, 18) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 39),I= 19, 24) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 39),I= 25, 30) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 39),I= 31, 36) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 39),I= 37, 42) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 39),I= 43, 48) /-6.913580246913580D-01 - $ 
,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 39),I= 49, 54) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 39),I= 55, 60) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 39),I= 61, 66) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 39),I= 67, 72) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 39),I= 73, 78) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 39),I= 79, 84) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 39),I= 85, 90) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 39),I= 91, 96) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 39),I= 97,102) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 39),I=103,108) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 39),I=109,114) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 39),I=115,120) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ + DATA (CF(I),I=3858,3939) /4096,-1024,1280,128,-448,992,992,-160, + $ -160,-16,1136,-142,992,-124,1010,1028,-16,2,128,-16,20,2,-160 + $ ,20,1280,-160,200,20,-268,-88,38,-142,-106,-124,-16,2,-160,20, + $ -142,-124,128,-16,-1024,128,-160,-16,-16,2,128,-16,20,2,-124,56 + $ ,20,2,-124,20,-88,1028,-232,272,-88,-232,56,-124,-124,20,20,2 + $ ,884,-232,-106,-124,-88,56,-106,-88,200,20,1028,-124/ C 1 T(2,6,5,1,7,3,4) - DATA (CF(I, 40),I= 1, 6) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 40),I= 7, 12) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 40),I= 13, 18) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 40),I= 19, 24) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 40),I= 25, 30) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ 
-2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 40),I= 31, 36) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 40),I= 37, 42) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 40),I= 43, 48) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 40),I= 49, 54) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 40),I= 55, 60) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 40),I= 61, 66) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 40),I= 67, 72) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 40),I= 73, 78) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 40),I= 79, 84) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 40),I= 85, 90) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 40),I= 91, 96) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 40),I= 97,102) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 40),I=103,108) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 40),I=109,114) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 40),I=115,120) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ + DATA (CF(I),I=3940,4020) /4096,128,-1024,992,1136,-160,1280,-16 + $ ,128,-142,38,-124,-106,-268,-88,2,20,-16,128,2,-16,20,200,-160 + $ ,1280,20,-160,1010,1028,-142,1136,-124,992,2,20,20,-124,-124,56 + $ ,-16,-160,128,-1024,-16,128,2,20,-16,128,2,-16,-142,-124,2,-16 + $ ,20,-160,-268,1010,884,-232,-268,884,-124,-142,20,-160,2,-16, + $ -268,-88,-124,992,56,-448,38,-106,20,-160,-124,992/ C 1 T(2,6,5,7,1,3,4) - DATA (CF(I, 41),I= 1, 6) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 41),I= 7, 12) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ 
-3.580246913580247D-01/ - DATA (CF(I, 41),I= 13, 18) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 41),I= 19, 24) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 41),I= 25, 30) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 41),I= 31, 36) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 41),I= 37, 42) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 41),I= 43, 48) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 41),I= 49, 54) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 41),I= 55, 60) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 41),I= 61, 66) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 41),I= 67, 72) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 41),I= 73, 78) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 41),I= 79, 84) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 41),I= 85, 90) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 41),I= 91, 96) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 41),I= 97,102) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 41),I=103,108) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 41),I=109,114) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 41),I=115,120) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ + DATA (CF(I),I=4021,4100) /4096,-1024,-16,128,-160,-16,-1024,128 + $ ,1028,-88,-88,-232,-232,272,-124,56,20,2,-124,20,-106,-88,200 + $ ,20,1028,-124,884,-232,-106,-124,-88,56,2,-16,-142,-124,-160,20 + $ ,-16,128,-160,-16,-1024,128,-124,56,20,2,-124,20,-16,2,128,-16 + $ 
,20,2,-142,1136,1010,1028,992,-124,2,-16,20,2,128,-16,-268,-88 + $ ,38,-142,-106,-124,-160,20,1280,-160,200,20/ C 1 T(2,6,7,1,5,3,4) - DATA (CF(I, 42),I= 1, 6) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 42),I= 7, 12) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 42),I= 13, 18) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 42),I= 19, 24) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 42),I= 25, 30) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 42),I= 31, 36) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 42),I= 37, 42) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 42),I= 43, 48) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 42),I= 49, 54) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 42),I= 55, 60) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 42),I= 61, 66) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 42),I= 67, 72) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 42),I= 73, 78) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 42),I= 79, 84) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 42),I= 85, 90) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 42),I= 91, 96) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 42),I= 97,102) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 42),I=103,108) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 42),I=109,114) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ 
,1.530864197530864D+00/ - DATA (CF(I, 42),I=115,120) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ + DATA (CF(I),I=4101,4179) /4096,-160,-16,-16,128,128,-1024,1010, + $ -268,-268,884,884,-232,-142,-124,2,-16,20,-160,38,-106,20,-160, + $ -124,992,-268,-88,-124,992,56,-448,20,2,-124,56,20,-124,-160, + $ -16,-16,128,128,-1024,-142,-124,2,-16,20,-160,2,20,-16,128,2, + $ -16,38,-142,-268,-88,-124,-106,20,2,2,-16,-16,128,1010,1028, + $ -142,1136,-124,992,20,200,-160,1280,20,-160/ C 1 T(2,6,7,5,1,3,4) - DATA (CF(I, 43),I= 1, 6) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 43),I= 7, 12) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 43),I= 13, 18) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 43),I= 19, 24) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 43),I= 25, 30) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 43),I= 31, 36) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 43),I= 37, 42) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 43),I= 43, 48) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 43),I= 49, 54) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 43),I= 55, 60) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 43),I= 61, 66) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 43),I= 67, 72) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 43),I= 73, 78) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 43),I= 79, 84) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 43),I= 85, 90) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 43),I= 91, 96) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 43),I= 97,102) /1.975308641975309D-01, - $ 
-2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 43),I=103,108) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 43),I=109,114) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 43),I=115,120) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ + DATA (CF(I),I=4180,4257) /4096,-1024,-1024,128,128,1280,-124,992 + $ ,-268,-88,-448,56,20,-160,-124,-142,-16,2,884,-232,-268,1010 + $ ,884,-268,992,-124,-160,20,-106,38,-106,-124,884,-232,56,-88, + $ -124,20,56,-124,2,20,-232,272,-88,1028,-232,-88,-124,1028,20 + $ ,200,-88,-106,128,-16,-16,2,2,20,-1024,128,128,-16,-16,-160, + $ -160,20,-16,2,-124,-142,20,-124,2,20,56,-124/ C 1 T(2,7,1,5,6,3,4) - DATA (CF(I, 44),I= 1, 6) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 44),I= 7, 12) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 44),I= 13, 18) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 44),I= 19, 24) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 44),I= 25, 30) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 44),I= 31, 36) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 44),I= 37, 42) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 44),I= 43, 48) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 44),I= 49, 54) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 44),I= 55, 60) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 44),I= 61, 66) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 44),I= 67, 72) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 44),I= 73, 78) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 44),I= 79, 84) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ 
,3.086419753086420D-03/ - DATA (CF(I, 44),I= 85, 90) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 44),I= 91, 96) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 44),I= 97,102) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 44),I=103,108) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 44),I=109,114) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 44),I=115,120) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ + DATA (CF(I),I=4258,4334) /4096,128,1280,-1024,128,-106,-124,884, + $ -232,56,-88,-124,20,56,-124,2,20,-232,272,-88,1028,-232,-88, + $ -124,1028,20,200,-88,-106,-124,992,-268,-88,-448,56,20,-160, + $ -124,-142,-16,2,884,-232,-268,1010,884,-268,992,-124,-160,20, + $ -106,38,-16,128,2,20,-16,2,128,-1024,-16,-160,128,-16,20,-124,2 + $ ,20,56,-124,-160,20,-16,2,-124,-142/ C 1 T(2,7,1,6,5,3,4) - DATA (CF(I, 45),I= 1, 6) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 45),I= 7, 12) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 45),I= 13, 18) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 45),I= 19, 24) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 45),I= 25, 30) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 45),I= 31, 36) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 45),I= 37, 42) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 45),I= 43, 48) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 45),I= 49, 54) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 45),I= 55, 60) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 45),I= 61, 66) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 45),I= 67, 72) /-2.469135802469136D-01 - $ 
,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 45),I= 73, 78) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 45),I= 79, 84) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 45),I= 85, 90) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 45),I= 91, 96) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 45),I= 97,102) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 45),I=103,108) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 45),I=109,114) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 45),I=115,120) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ + DATA (CF(I),I=4335,4410) /4096,-1024,1280,128,-142,1136,1010 + $ ,1028,992,-124,2,-16,20,2,128,-16,-268,-88,38,-142,-106,-124, + $ -160,20,1280,-160,200,20,-88,1028,-232,272,-88,-232,56,-124, + $ -124,20,20,2,884,-232,-106,-124,-88,56,-106,-88,200,20,1028, + $ -124,-16,2,-160,20,-142,-124,128,-16,-1024,128,-160,-16,-16,2 + $ ,128,-16,20,2,-124,56,20,2,-124,20/ C 1 T(2,7,5,1,6,3,4) - DATA (CF(I, 46),I= 1, 6) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 46),I= 7, 12) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 46),I= 13, 18) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 46),I= 19, 24) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 46),I= 25, 30) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 46),I= 31, 36) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 46),I= 37, 42) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 46),I= 43, 48) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 46),I= 49, 54) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ 
-1.635802469135803D-01/ - DATA (CF(I, 46),I= 55, 60) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 46),I= 61, 66) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 46),I= 67, 72) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 46),I= 73, 78) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 46),I= 79, 84) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 46),I= 85, 90) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 46),I= 91, 96) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 46),I= 97,102) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 46),I=103,108) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 46),I=109,114) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 46),I=115,120) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ + DATA (CF(I),I=4411,4485) /4096,128,-1024,38,-142,-268,-88,-124, + $ -106,20,2,2,-16,-16,128,1010,1028,-142,1136,-124,992,20,200, + $ -160,1280,20,-160,-268,1010,884,-232,-268,884,-124,-142,20,-160 + $ ,2,-16,-268,-88,-124,992,56,-448,38,-106,20,-160,-124,992,2,20 + $ ,20,-124,-124,56,-16,-160,128,-1024,-16,128,2,20,-16,128,2,-16, + $ -142,-124,2,-16,20,-160/ C 1 T(2,7,5,6,1,3,4) - DATA (CF(I, 47),I= 1, 6) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 47),I= 7, 12) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 47),I= 13, 18) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 47),I= 19, 24) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 47),I= 25, 30) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 47),I= 31, 36) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 47),I= 37, 42) /-2.469135802469136D-02 - $ 
,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 47),I= 43, 48) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 47),I= 49, 54) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 47),I= 55, 60) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 47),I= 61, 66) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 47),I= 67, 72) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 47),I= 73, 78) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 47),I= 79, 84) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 47),I= 85, 90) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 47),I= 91, 96) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 47),I= 97,102) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 47),I=103,108) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 47),I=109,114) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 47),I=115,120) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ + DATA (CF(I),I=4486,4559) /4096,-1024,-88,1028,-232,272,-88,-232 + $ ,56,-124,-124,20,20,2,884,-232,-106,-124,-88,56,-106,-88,200,20 + $ ,1028,-124,-142,1136,1010,1028,992,-124,2,-16,20,2,128,-16,-268 + $ ,-88,38,-142,-106,-124,-160,20,1280,-160,200,20,2,-16,-142,-124 + $ ,-160,20,-16,128,-160,-16,-1024,128,-124,56,20,2,-124,20,-16,2 + $ ,128,-16,20,2/ C 1 T(2,7,6,1,5,3,4) - DATA (CF(I, 48),I= 1, 6) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 48),I= 7, 12) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 48),I= 13, 18) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 48),I= 19, 24) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - 
DATA (CF(I, 48),I= 25, 30) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 48),I= 31, 36) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 48),I= 37, 42) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 48),I= 43, 48) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 48),I= 49, 54) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 48),I= 55, 60) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 48),I= 61, 66) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 48),I= 67, 72) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 48),I= 73, 78) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 48),I= 79, 84) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 48),I= 85, 90) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 48),I= 91, 96) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 48),I= 97,102) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 48),I=103,108) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 48),I=109,114) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 48),I=115,120) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ + DATA (CF(I),I=4560,4632) /4096,-268,1010,884,-232,-268,884,-124, + $ -142,20,-160,2,-16,-268,-88,-124,992,56,-448,38,-106,20,-160, + $ -124,992,38,-142,-268,-88,-124,-106,20,2,2,-16,-16,128,1010 + $ ,1028,-142,1136,-124,992,20,200,-160,1280,20,-160,20,2,-124,56 + $ ,20,-124,-160,-16,-16,128,128,-1024,-142,-124,2,-16,20,-160,2 + $ ,20,-16,128,2,-16/ C 1 T(2,7,6,5,1,3,4) - DATA (CF(I, 49),I= 1, 6) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 49),I= 7, 12) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ 
-2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 49),I= 13, 18) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 49),I= 19, 24) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 49),I= 25, 30) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 49),I= 31, 36) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 49),I= 37, 42) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 49),I= 43, 48) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 49),I= 49, 54) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 49),I= 55, 60) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 49),I= 61, 66) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 49),I= 67, 72) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 49),I= 73, 78) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 49),I= 79, 84) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 49),I= 85, 90) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 49),I= 91, 96) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 49),I= 97,102) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 49),I=103,108) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 49),I=109,114) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 49),I=115,120) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ + DATA (CF(I),I=4633,4704) /4096,-1024,-1024,128,128,1280,-1024 + $ ,128,128,-16,-16,-160,128,-16,1280,-160,1136,992,-16,-160,-160 + $ ,992,992,-448,992,-124,-160,20,-106,38,-448,56,992,-124,-88, + $ 
-268,-16,2,-160,20,-142,-124,884,-268,-232,884,1010,-268,-124 + $ ,1028,20,200,-88,-106,56,-88,-124,-106,-232,884,2,20,20,-124, + $ -124,56,-232,-88,272,-232,1028,-88/ C 1 T(5,1,2,6,7,3,4) - DATA (CF(I, 50),I= 1, 6) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 50),I= 7, 12) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 50),I= 13, 18) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 50),I= 19, 24) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 50),I= 25, 30) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 50),I= 31, 36) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 50),I= 37, 42) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 50),I= 43, 48) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 50),I= 49, 54) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 50),I= 55, 60) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 50),I= 61, 66) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 50),I= 67, 72) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 50),I= 73, 78) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 50),I= 79, 84) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 50),I= 85, 90) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 50),I= 91, 96) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 50),I= 97,102) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 50),I=103,108) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 50),I=109,114) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ 
,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 50),I=115,120) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ + DATA (CF(I),I=4705,4775) /4096,128,1280,-1024,128,128,-1024,-16, + $ -160,128,-16,-16,-160,-160,992,992,-448,128,-16,1280,-160,1136 + $ ,992,-124,1028,20,200,-88,-106,56,-88,-124,-106,-232,884,2,20 + $ ,20,-124,-124,56,-232,-88,272,-232,1028,-88,992,-124,-160,20, + $ -106,38,-448,56,992,-124,-88,-268,-16,2,-160,20,-142,-124,884, + $ -268,-232,884,1010,-268/ C 1 T(5,1,2,7,6,3,4) - DATA (CF(I, 51),I= 1, 6) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 51),I= 7, 12) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 51),I= 13, 18) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 51),I= 19, 24) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 51),I= 25, 30) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 51),I= 31, 36) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 51),I= 37, 42) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 51),I= 43, 48) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 51),I= 49, 54) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 51),I= 55, 60) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 51),I= 61, 66) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 51),I= 67, 72) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 51),I= 73, 78) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 51),I= 79, 84) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 51),I= 85, 90) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 51),I= 91, 96) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 51),I= 
97,102) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 51),I=103,108) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 51),I=109,114) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 51),I=115,120) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ + DATA (CF(I),I=4776,4845) /4096,-1024,1280,128,128,-16,1280,-160 + $ ,1136,992,-1024,128,128,-16,-16,-160,-160,-16,992,-448,-160,992 + $ ,-160,20,1280,-160,200,20,992,-124,1136,-142,1028,1010,128,-16, + $ -16,2,2,20,-106,-124,-88,-268,-142,38,-106,-88,200,20,1028,-124 + $ ,-88,-232,1028,-88,272,-232,20,2,-124,56,20,-124,-88,56,-232 + $ ,884,-124,-106/ C 1 T(5,1,6,2,7,3,4) - DATA (CF(I, 52),I= 1, 6) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 52),I= 7, 12) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 52),I= 13, 18) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 52),I= 19, 24) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 52),I= 25, 30) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 52),I= 31, 36) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 52),I= 37, 42) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 52),I= 43, 48) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 52),I= 49, 54) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 52),I= 55, 60) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 52),I= 61, 66) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 52),I= 67, 72) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 52),I= 73, 78) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 52),I= 79, 84) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ 
,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 52),I= 85, 90) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 52),I= 91, 96) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 52),I= 97,102) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 52),I=103,108) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 52),I=109,114) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 52),I=115,120) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ + DATA (CF(I),I=4846,4914) /4096,128,-1024,-16,-160,-160,992,992, + $ -448,128,-1024,-16,-160,128,-16,-16,128,1136,992,1280,-160,20 + $ ,200,-160,1280,20,-160,-124,-106,-142,38,-88,-268,-16,128,2,20, + $ -16,2,-124,992,1028,1010,1136,-142,38,-106,20,-160,-124,992, + $ -268,884,1010,-268,-232,884,2,-16,-142,-124,-160,20,56,-448,-88 + $ ,-268,992,-124/ C 1 T(5,1,6,7,2,3,4) - DATA (CF(I, 53),I= 1, 6) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 53),I= 7, 12) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 53),I= 13, 18) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 53),I= 19, 24) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 53),I= 25, 30) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 53),I= 31, 36) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 53),I= 37, 42) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 53),I= 43, 48) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 53),I= 49, 54) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 53),I= 55, 60) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 53),I= 61, 66) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 53),I= 67, 72) /-1.580246913580247D+00 - 
$ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 53),I= 73, 78) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 53),I= 79, 84) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 53),I= 85, 90) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 53),I= 91, 96) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 53),I= 97,102) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 53),I=103,108) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 53),I=109,114) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 53),I=115,120) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ + DATA (CF(I),I=4915,4982) /4096,-1024,-16,128,1136,992,1280,-160, + $ -160,-16,992,-448,-160,992,-1024,128,128,-16,-16,-160,-106,-88 + $ ,200,20,1028,-124,-88,-232,1028,-88,272,-232,20,2,-124,56,20, + $ -124,-88,56,-232,884,-124,-106,-160,20,1280,-160,200,20,992, + $ -124,1136,-142,1028,1010,128,-16,-16,2,2,20,-106,-124,-88,-268, + $ -142,38/ C 1 T(5,1,7,2,6,3,4) - DATA (CF(I, 54),I= 1, 6) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 54),I= 7, 12) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 54),I= 13, 18) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 54),I= 19, 24) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 54),I= 25, 30) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 54),I= 31, 36) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 54),I= 37, 42) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 54),I= 43, 48) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 54),I= 49, 54) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ 
,1.264197530864197D+01/ - DATA (CF(I, 54),I= 55, 60) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 54),I= 61, 66) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 54),I= 67, 72) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 54),I= 73, 78) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 54),I= 79, 84) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 54),I= 85, 90) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 54),I= 91, 96) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 54),I= 97,102) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 54),I=103,108) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 54),I=109,114) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 54),I=115,120) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ + DATA (CF(I),I=4983,5049) /4096,-160,-16,992,-448,-160,992,-16 + $ ,128,1136,992,1280,-160,128,-1024,-16,-160,128,-16,38,-106,20, + $ -160,-124,992,-268,884,1010,-268,-232,884,2,-16,-142,-124,-160 + $ ,20,56,-448,-88,-268,992,-124,20,200,-160,1280,20,-160,-124, + $ -106,-142,38,-88,-268,-16,128,2,20,-16,2,-124,992,1028,1010 + $ ,1136,-142/ C 1 T(5,1,7,6,2,3,4) - DATA (CF(I, 55),I= 1, 6) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 55),I= 7, 12) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 55),I= 13, 18) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 55),I= 19, 24) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 55),I= 25, 30) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 55),I= 31, 36) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 55),I= 37, 42) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ 
,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 55),I= 43, 48) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 55),I= 49, 54) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 55),I= 55, 60) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 55),I= 61, 66) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 55),I= 67, 72) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 55),I= 73, 78) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 55),I= 79, 84) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 55),I= 85, 90) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 55),I= 91, 96) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I, 55),I= 97,102) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 55),I=103,108) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 55),I=109,114) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 55),I=115,120) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ + DATA (CF(I),I=5050,5115) /4096,-1024,-1024,128,128,1280,1280, + $ -160,128,-16,992,1136,-160,992,-16,-160,-448,992,-448,56,992, + $ -124,-88,-268,992,-124,-160,20,-106,38,-160,20,-16,2,-124,-142, + $ -232,884,884,-268,-268,1010,56,-88,-124,-106,-232,884,-124,1028 + $ ,20,200,-88,-106,20,-124,2,20,56,-124,272,-232,-232,-88,-88 + $ ,1028/ C 1 T(5,2,1,6,7,3,4) - DATA (CF(I, 56),I= 1, 6) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 56),I= 7, 12) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 56),I= 13, 18) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 56),I= 19, 24) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 56),I= 25, 30) /-2.469135802469136D-02 - $ 
,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 56),I= 31, 36) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 56),I= 37, 42) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 56),I= 43, 48) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 56),I= 49, 54) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 56),I= 55, 60) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 56),I= 61, 66) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 56),I= 67, 72) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 56),I= 73, 78) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 56),I= 79, 84) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 56),I= 85, 90) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 56),I= 91, 96) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 56),I= 97,102) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 56),I=103,108) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 56),I=109,114) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 56),I=115,120) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ + DATA (CF(I),I=5116,5180) /4096,128,1280,-1024,128,-160,992,-16, + $ -160,-448,992,1280,-160,128,-16,992,1136,56,-88,-124,-106,-232 + $ ,884,-124,1028,20,200,-88,-106,20,-124,2,20,56,-124,272,-232, + $ -232,-88,-88,1028,-448,56,992,-124,-88,-268,992,-124,-160,20, + $ -106,38,-160,20,-16,2,-124,-142,-232,884,884,-268,-268,1010/ C 1 T(5,2,1,7,6,3,4) - DATA (CF(I, 57),I= 1, 6) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 57),I= 7, 12) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 57),I= 13, 
18) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 57),I= 19, 24) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 57),I= 25, 30) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 57),I= 31, 36) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 57),I= 37, 42) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 57),I= 43, 48) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 57),I= 49, 54) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 57),I= 55, 60) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 57),I= 61, 66) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 57),I= 67, 72) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 57),I= 73, 78) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 57),I= 79, 84) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 57),I= 85, 90) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 57),I= 91, 96) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I, 57),I= 97,102) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 57),I=103,108) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 57),I=109,114) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 57),I=115,120) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ + DATA (CF(I),I=5181,5244) /4096,-1024,1280,128,128,-16,-1024,128, + $ -160,-16,992,-448,-160,-16,992,-160,992,-124,1136,-142,1028 + $ ,1010,-160,20,1280,-160,200,20,-16,2,128,-16,20,2,-88,-268,-106 + $ ,-124,38,-142,-88,-232,1028,-88,272,-232,-106,-88,200,20,1028, + $ -124,-124,56,20,2,-124,20,-232,884,-88,56,-106,-124/ C 1 
T(5,2,6,1,7,3,4) - DATA (CF(I, 58),I= 1, 6) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 58),I= 7, 12) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 58),I= 13, 18) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 58),I= 19, 24) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 58),I= 25, 30) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 58),I= 31, 36) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 58),I= 37, 42) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 58),I= 43, 48) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 58),I= 49, 54) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 58),I= 55, 60) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 58),I= 61, 66) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 58),I= 67, 72) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 58),I= 73, 78) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 58),I= 79, 84) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 58),I= 85, 90) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 58),I= 91, 96) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I, 58),I= 97,102) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 58),I=103,108) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 58),I=109,114) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 58),I=115,120) /-1.358024691358025D-01, - $ 
-4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ + DATA (CF(I),I=5245,5307) /4096,128,-1024,-16,-160,128,-1024,-16 + $ ,128,1136,992,-16,128,-160,1280,-124,-106,-142,38,-88,-268,20 + $ ,200,-160,1280,20,-160,2,20,-16,128,2,-16,1028,1010,-124,992, + $ -142,1136,-268,884,1010,-268,-232,884,38,-106,20,-160,-124,992, + $ -142,-124,2,-16,20,-160,-88,-268,56,-448,-124,992/ C 1 T(5,2,6,7,1,3,4) - DATA (CF(I, 59),I= 1, 6) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 59),I= 7, 12) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 59),I= 13, 18) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 59),I= 19, 24) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 59),I= 25, 30) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 59),I= 31, 36) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 59),I= 37, 42) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 59),I= 43, 48) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 59),I= 49, 54) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 59),I= 55, 60) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 59),I= 61, 66) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 59),I= 67, 72) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 59),I= 73, 78) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 59),I= 79, 84) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 59),I= 85, 90) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 59),I= 91, 96) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 59),I= 97,102) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 59),I=103,108) 
/-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 59),I=109,114) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 59),I=115,120) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ + DATA (CF(I),I=5308,5369) /4096,-1024,992,-448,-160,-16,992,-160 + $ ,128,-16,-1024,128,-160,-16,-88,-232,1028,-88,272,-232,-106,-88 + $ ,200,20,1028,-124,-124,56,20,2,-124,20,-232,884,-88,56,-106, + $ -124,992,-124,1136,-142,1028,1010,-160,20,1280,-160,200,20,-16 + $ ,2,128,-16,20,2,-88,-268,-106,-124,38,-142/ C 1 T(5,2,7,1,6,3,4) - DATA (CF(I, 60),I= 1, 6) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 60),I= 7, 12) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 60),I= 13, 18) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 60),I= 19, 24) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 60),I= 25, 30) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 60),I= 31, 36) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 60),I= 37, 42) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 60),I= 43, 48) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 60),I= 49, 54) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 60),I= 55, 60) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 60),I= 61, 66) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 60),I= 67, 72) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 60),I= 73, 78) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 60),I= 79, 84) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 60),I= 85, 90) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ 
-2.469135802469136D-01/ - DATA (CF(I, 60),I= 91, 96) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 60),I= 97,102) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 60),I=103,108) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 60),I=109,114) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 60),I=115,120) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ + DATA (CF(I),I=5370,5430) /4096,1136,992,-16,128,-160,1280,-16, + $ -160,128,-1024,-16,128,-268,884,1010,-268,-232,884,38,-106,20, + $ -160,-124,992,-142,-124,2,-16,20,-160,-88,-268,56,-448,-124,992 + $ ,-124,-106,-142,38,-88,-268,20,200,-160,1280,20,-160,2,20,-16 + $ ,128,2,-16,1028,1010,-124,992,-142,1136/ C 1 T(5,2,7,6,1,3,4) - DATA (CF(I, 61),I= 1, 6) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 61),I= 7, 12) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 61),I= 13, 18) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 61),I= 19, 24) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 61),I= 25, 30) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 61),I= 31, 36) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 61),I= 37, 42) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 61),I= 43, 48) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 61),I= 49, 54) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 61),I= 55, 60) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 61),I= 61, 66) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 61),I= 67, 72) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 61),I= 73, 78) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ 
-2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 61),I= 79, 84) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 61),I= 85, 90) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 61),I= 91, 96) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 61),I= 97,102) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 61),I=103,108) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 61),I=109,114) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 61),I=115,120) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ + DATA (CF(I),I=5431,5490) /4096,-1024,-1024,128,128,1280,992,-160 + $ ,-448,992,-16,-160,-16,2,128,-16,20,2,-160,20,-16,2,-124,-142, + $ -1024,128,128,-16,-16,-160,-124,20,56,-124,2,20,884,-232,-106, + $ -124,-88,56,-232,272,-88,1028,-232,-88,-124,20,56,-124,2,20 + $ ,1028,-124,-88,-106,20,200/ C 1 T(5,6,1,2,7,3,4) - DATA (CF(I, 62),I= 1, 6) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 62),I= 7, 12) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 62),I= 13, 18) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 62),I= 19, 24) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 62),I= 25, 30) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 62),I= 31, 36) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 62),I= 37, 42) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 62),I= 43, 48) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 62),I= 49, 54) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 62),I= 55, 60) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 62),I= 61, 66) /-1.580246913580247D+00 - $ 
,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 62),I= 67, 72) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 62),I= 73, 78) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 62),I= 79, 84) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 62),I= 85, 90) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 62),I= 91, 96) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 62),I= 97,102) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 62),I=103,108) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 62),I=109,114) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 62),I=115,120) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ + DATA (CF(I),I=5491,5549) /4096,128,1280,-1024,128,-160,1280,992 + $ ,1136,128,-16,2,20,-16,128,2,-16,20,-124,2,20,56,-124,128,-1024 + $ ,-16,-160,128,-16,20,-160,-124,-142,-16,2,-268,-88,-124,992,56, + $ -448,884,-232,-268,1010,884,-268,20,-160,-124,-142,-16,2,-124 + $ ,992,-106,38,-160,20/ C 1 T(5,6,1,7,2,3,4) - DATA (CF(I, 63),I= 1, 6) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 63),I= 7, 12) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 63),I= 13, 18) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 63),I= 19, 24) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 63),I= 25, 30) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 63),I= 31, 36) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 63),I= 37, 42) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 63),I= 43, 48) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 63),I= 49, 54) /1.975308641975309D+00, - 
$ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 63),I= 55, 60) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 63),I= 61, 66) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 63),I= 67, 72) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 63),I= 73, 78) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 63),I= 79, 84) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 63),I= 85, 90) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 63),I= 91, 96) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 63),I= 97,102) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 63),I=103,108) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 63),I=109,114) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 63),I=115,120) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ + DATA (CF(I),I=5550,5607) /4096,-1024,1280,128,-448,992,992,-160, + $ -160,-16,-160,20,-16,2,-124,-142,-16,2,128,-16,20,2,128,-16, + $ -1024,128,-160,-16,56,-124,-124,20,20,2,-232,272,-88,1028,-232, + $ -88,884,-232,-106,-124,-88,56,56,-124,-124,20,20,2,-88,-106 + $ ,1028,-124,200,20/ C 1 T(5,6,2,1,7,3,4) - DATA (CF(I, 64),I= 1, 6) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 64),I= 7, 12) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 64),I= 13, 18) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 64),I= 19, 24) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 64),I= 25, 30) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 64),I= 31, 36) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 64),I= 37, 42) /3.086419753086420D-02 - 
$ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 64),I= 43, 48) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 64),I= 49, 54) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 64),I= 55, 60) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 64),I= 61, 66) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 64),I= 67, 72) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 64),I= 73, 78) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 64),I= 79, 84) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 64),I= 85, 90) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 64),I= 91, 96) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 64),I= 97,102) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 64),I=103,108) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 64),I=109,114) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 64),I=115,120) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ + DATA (CF(I),I=5608,5664) /4096,128,-1024,992,1136,-160,1280,-16 + $ ,128,20,-124,2,20,56,-124,2,20,-16,128,2,-16,-16,-160,128,-1024 + $ ,-16,128,-124,-142,20,-160,2,-16,884,-232,-268,1010,884,-268, + $ -268,-88,-124,992,56,-448,-124,-142,20,-160,2,-16,-106,38,-124 + $ ,992,20,-160/ C 1 T(5,6,2,7,1,3,4) - DATA (CF(I, 65),I= 1, 6) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 65),I= 7, 12) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 65),I= 13, 18) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 65),I= 19, 24) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 65),I= 25, 30) 
/1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 65),I= 31, 36) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 65),I= 37, 42) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 65),I= 43, 48) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 65),I= 49, 54) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 65),I= 55, 60) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 65),I= 61, 66) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 65),I= 67, 72) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 65),I= 73, 78) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 65),I= 79, 84) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 65),I= 85, 90) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 65),I= 91, 96) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 65),I= 97,102) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 65),I=103,108) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 65),I=109,114) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 65),I=115,120) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ + DATA (CF(I),I=5665,5720) /4096,-1024,-16,128,-160,-16,-1024,128, + $ -142,-124,2,-16,20,-160,-124,56,20,2,-124,20,-16,128,-160,-16, + $ -1024,128,2,-16,20,2,128,-16,1010,1028,-142,1136,-124,992,-268, + $ -88,38,-142,-106,-124,2,-16,20,2,128,-16,20,-160,200,20,1280, + $ -160/ C 1 T(5,6,7,1,2,3,4) - DATA (CF(I, 66),I= 1, 6) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 66),I= 7, 12) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 66),I= 13, 18) /-1.635802469135803D-01, - $ 
-1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 66),I= 19, 24) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 66),I= 25, 30) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 66),I= 31, 36) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 66),I= 37, 42) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 66),I= 43, 48) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 66),I= 49, 54) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 66),I= 55, 60) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 66),I= 61, 66) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 66),I= 67, 72) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 66),I= 73, 78) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 66),I= 79, 84) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 66),I= 85, 90) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 66),I= 91, 96) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 66),I= 97,102) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 66),I=103,108) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 66),I=109,114) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 66),I=115,120) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ + DATA (CF(I),I=5721,5775) /4096,-160,-16,-16,128,128,-1024,-124 + $ ,56,20,2,-124,20,-142,-124,2,-16,20,-160,-160,-16,-16,128,128, + $ -1024,20,2,2,-16,-16,128,-268,-88,38,-142,-106,-124,1010,1028, + $ -142,1136,-124,992,20,2,2,-16,-16,128,200,20,20,-160,-160,1280/ C 1 T(5,6,7,2,1,3,4) - DATA (CF(I, 67),I= 1, 6) /-2.191358024691358D-01 - $ 
,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 67),I= 7, 12) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 67),I= 13, 18) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 67),I= 19, 24) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 67),I= 25, 30) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 67),I= 31, 36) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 67),I= 37, 42) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 67),I= 43, 48) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 67),I= 49, 54) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 67),I= 55, 60) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 67),I= 61, 66) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 67),I= 67, 72) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 67),I= 73, 78) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 67),I= 79, 84) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 67),I= 85, 90) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 67),I= 91, 96) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 67),I= 97,102) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 67),I=103,108) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 67),I=109,114) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 67),I=115,120) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ + 
DATA (CF(I),I=5776,5829) /4096,-1024,-1024,128,128,1280,884,-232 + $ ,-106,-124,-88,56,-232,272,-88,1028,-232,-88,-124,20,56,-124,2 + $ ,20,1028,-124,-88,-106,20,200,-16,2,128,-16,20,2,-160,20,-16,2, + $ -124,-142,-1024,128,128,-16,-16,-160,-124,20,56,-124,2,20/ C 1 T(5,7,1,2,6,3,4) - DATA (CF(I, 68),I= 1, 6) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 68),I= 7, 12) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 68),I= 13, 18) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 68),I= 19, 24) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 68),I= 25, 30) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 68),I= 31, 36) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 68),I= 37, 42) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 68),I= 43, 48) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 68),I= 49, 54) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 68),I= 55, 60) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 68),I= 61, 66) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 68),I= 67, 72) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 68),I= 73, 78) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 68),I= 79, 84) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 68),I= 85, 90) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 68),I= 91, 96) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 68),I= 97,102) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 68),I=103,108) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 68),I=109,114) 
/1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 68),I=115,120) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ + DATA (CF(I),I=5830,5882) /4096,128,1280,-1024,128,-268,-88,-124 + $ ,992,56,-448,884,-232,-268,1010,884,-268,20,-160,-124,-142,-16 + $ ,2,-124,992,-106,38,-160,20,2,20,-16,128,2,-16,20,-124,2,20,56, + $ -124,128,-1024,-16,-160,128,-16,20,-160,-124,-142,-16,2/ C 1 T(5,7,1,6,2,3,4) - DATA (CF(I, 69),I= 1, 6) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 69),I= 7, 12) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 69),I= 13, 18) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 69),I= 19, 24) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 69),I= 25, 30) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 69),I= 31, 36) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 69),I= 37, 42) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 69),I= 43, 48) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 69),I= 49, 54) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 69),I= 55, 60) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 69),I= 61, 66) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 69),I= 67, 72) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 69),I= 73, 78) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 69),I= 79, 84) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 69),I= 85, 90) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 69),I= 91, 96) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 69),I= 97,102) /-2.469135802469136D-01 - $ 
,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 69),I=103,108) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 69),I=109,114) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 69),I=115,120) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ + DATA (CF(I),I=5883,5934) /4096,-1024,1280,128,-232,272,-88,1028, + $ -232,-88,884,-232,-106,-124,-88,56,56,-124,-124,20,20,2,-88, + $ -106,1028,-124,200,20,-160,20,-16,2,-124,-142,-16,2,128,-16,20 + $ ,2,128,-16,-1024,128,-160,-16,56,-124,-124,20,20,2/ C 1 T(5,7,2,1,6,3,4) - DATA (CF(I, 70),I= 1, 6) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 70),I= 7, 12) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 70),I= 13, 18) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 70),I= 19, 24) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 70),I= 25, 30) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 70),I= 31, 36) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 70),I= 37, 42) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 70),I= 43, 48) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 70),I= 49, 54) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 70),I= 55, 60) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 70),I= 61, 66) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 70),I= 67, 72) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 70),I= 73, 78) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 70),I= 79, 84) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 70),I= 85, 90) /-1.913580246913580D-01, - $ 
-2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 70),I= 91, 96) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 70),I= 97,102) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 70),I=103,108) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 70),I=109,114) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 70),I=115,120) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ + DATA (CF(I),I=5935,5985) /4096,128,-1024,884,-232,-268,1010,884, + $ -268,-268,-88,-124,992,56,-448,-124,-142,20,-160,2,-16,-106,38, + $ -124,992,20,-160,20,-124,2,20,56,-124,2,20,-16,128,2,-16,-16, + $ -160,128,-1024,-16,128,-124,-142,20,-160,2,-16/ C 1 T(5,7,2,6,1,3,4) - DATA (CF(I, 71),I= 1, 6) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 71),I= 7, 12) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 71),I= 13, 18) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 71),I= 19, 24) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 71),I= 25, 30) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 71),I= 31, 36) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 71),I= 37, 42) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 71),I= 43, 48) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 71),I= 49, 54) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 71),I= 55, 60) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 71),I= 61, 66) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 71),I= 67, 72) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 71),I= 73, 78) /1.558641975308642D+00 - $ 
,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 71),I= 79, 84) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 71),I= 85, 90) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 71),I= 91, 96) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 71),I= 97,102) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 71),I=103,108) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 71),I=109,114) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 71),I=115,120) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ + DATA (CF(I),I=5986,6035) /4096,-1024,1010,1028,-142,1136,-124 + $ ,992,-268,-88,38,-142,-106,-124,2,-16,20,2,128,-16,20,-160,200 + $ ,20,1280,-160,-142,-124,2,-16,20,-160,-124,56,20,2,-124,20,-16 + $ ,128,-160,-16,-1024,128,2,-16,20,2,128,-16/ C 1 T(5,7,6,1,2,3,4) - DATA (CF(I, 72),I= 1, 6) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 72),I= 7, 12) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 72),I= 13, 18) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 72),I= 19, 24) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 72),I= 25, 30) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 72),I= 31, 36) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 72),I= 37, 42) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 72),I= 43, 48) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 72),I= 49, 54) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 72),I= 55, 60) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 72),I= 61, 66) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ 
,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 72),I= 67, 72) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 72),I= 73, 78) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 72),I= 79, 84) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 72),I= 85, 90) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 72),I= 91, 96) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 72),I= 97,102) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 72),I=103,108) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 72),I=109,114) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 72),I=115,120) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ + DATA (CF(I),I=6036,6084) /4096,-268,-88,38,-142,-106,-124,1010 + $ ,1028,-142,1136,-124,992,20,2,2,-16,-16,128,200,20,20,-160,-160 + $ ,1280,-124,56,20,2,-124,20,-142,-124,2,-16,20,-160,-160,-16,-16 + $ ,128,128,-1024,20,2,2,-16,-16,128/ C 1 T(5,7,6,2,1,3,4) - DATA (CF(I, 73),I= 1, 6) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 73),I= 7, 12) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 73),I= 13, 18) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 73),I= 19, 24) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 73),I= 25, 30) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 73),I= 31, 36) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 73),I= 37, 42) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 73),I= 43, 48) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 73),I= 49, 54) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 
- $ ,5.864197530864197D-02/ - DATA (CF(I, 73),I= 55, 60) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 73),I= 61, 66) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 73),I= 67, 72) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 73),I= 73, 78) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 73),I= 79, 84) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 73),I= 85, 90) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 73),I= 91, 96) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 73),I= 97,102) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 73),I=103,108) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 73),I=109,114) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 73),I=115,120) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ + DATA (CF(I),I=6085,6132) /4096,-1024,-1024,128,128,1280,-1024 + $ ,128,128,-16,-16,-160,128,-16,1280,-160,1136,992,-16,-160,-160 + $ ,992,992,-448,1028,-124,-88,-106,20,200,-88,56,-232,884,-124, + $ -106,-232,-88,272,-232,1028,-88,2,20,20,-124,-124,56/ C 1 T(6,1,2,5,7,3,4) - DATA (CF(I, 74),I= 1, 6) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 74),I= 7, 12) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 74),I= 13, 18) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 74),I= 19, 24) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 74),I= 25, 30) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 74),I= 31, 36) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 74),I= 37, 42) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - 
DATA (CF(I, 74),I= 43, 48) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 74),I= 49, 54) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 74),I= 55, 60) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 74),I= 61, 66) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 74),I= 67, 72) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 74),I= 73, 78) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 74),I= 79, 84) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 74),I= 85, 90) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 74),I= 91, 96) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 74),I= 97,102) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 74),I=103,108) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 74),I=109,114) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 74),I=115,120) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ + DATA (CF(I),I=6133,6179) /4096,128,1280,-1024,128,128,-1024,-16, + $ -160,128,-16,-16,-160,-160,992,992,-448,128,-16,1280,-160,1136 + $ ,992,-124,992,-106,38,-160,20,56,-448,-88,-268,992,-124,884, + $ -268,-232,884,1010,-268,-16,2,-160,20,-142,-124/ C 1 T(6,1,2,7,5,3,4) - DATA (CF(I, 75),I= 1, 6) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 75),I= 7, 12) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 75),I= 13, 18) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 75),I= 19, 24) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 75),I= 25, 30) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 75),I= 31, 36) 
/-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 75),I= 37, 42) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 75),I= 43, 48) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 75),I= 49, 54) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 75),I= 55, 60) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 75),I= 61, 66) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 75),I= 67, 72) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 75),I= 73, 78) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 75),I= 79, 84) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 75),I= 85, 90) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 75),I= 91, 96) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 75),I= 97,102) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 75),I=103,108) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 75),I=109,114) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 75),I=115,120) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ + DATA (CF(I),I=6180,6225) /4096,-1024,1280,128,128,-16,1280,-160 + $ ,1136,992,-1024,128,128,-16,-16,-160,-160,-16,992,-448,-160,992 + $ ,-88,-106,1028,-124,200,20,-232,-88,272,-232,1028,-88,-88,56, + $ -232,884,-124,-106,20,2,-124,56,20,-124/ C 1 T(6,1,5,2,7,3,4) - DATA (CF(I, 76),I= 1, 6) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 76),I= 7, 12) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 76),I= 13, 18) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 76),I= 19, 24) /-1.913580246913580D-01, - $ 
-2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 76),I= 25, 30) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 76),I= 31, 36) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 76),I= 37, 42) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 76),I= 43, 48) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 76),I= 49, 54) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 76),I= 55, 60) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 76),I= 61, 66) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 76),I= 67, 72) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 76),I= 73, 78) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 76),I= 79, 84) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 76),I= 85, 90) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 76),I= 91, 96) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 76),I= 97,102) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 76),I=103,108) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 76),I=109,114) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 76),I=115,120) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ + DATA (CF(I),I=6226,6270) /4096,128,-1024,-16,-160,-160,992,992, + $ -448,128,-1024,-16,-160,128,-16,-16,128,1136,992,1280,-160,-106 + $ ,38,-124,992,20,-160,884,-268,-232,884,1010,-268,56,-448,-88, + $ -268,992,-124,2,-16,-142,-124,-160,20/ C 1 T(6,1,5,7,2,3,4) - DATA (CF(I, 77),I= 1, 6) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 77),I= 7, 12) /-1.913580246913580D-01 - $ 
,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 77),I= 13, 18) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 77),I= 19, 24) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 77),I= 25, 30) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 77),I= 31, 36) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 77),I= 37, 42) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 77),I= 43, 48) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 77),I= 49, 54) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 77),I= 55, 60) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 77),I= 61, 66) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 77),I= 67, 72) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 77),I= 73, 78) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 77),I= 79, 84) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 77),I= 85, 90) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 77),I= 91, 96) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 77),I= 97,102) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 77),I=103,108) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 77),I=109,114) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 77),I=115,120) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ + DATA (CF(I),I=6271,6314) /4096,-1024,-16,128,1136,992,1280,-160, + $ -160,-16,992,-448,-160,992,-1024,128,128,-16,-16,-160,20,-160 + $ ,200,20,1280,-160,-124,992,1028,1010,1136,-142,-106,-124,-88, + $ 
-268,-142,38,128,-16,-16,2,2,20/ C 1 T(6,1,7,2,5,3,4) - DATA (CF(I, 78),I= 1, 6) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 78),I= 7, 12) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 78),I= 13, 18) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 78),I= 19, 24) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 78),I= 25, 30) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 78),I= 31, 36) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 78),I= 37, 42) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 78),I= 43, 48) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 78),I= 49, 54) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 78),I= 55, 60) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 78),I= 61, 66) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 78),I= 67, 72) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 78),I= 73, 78) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 78),I= 79, 84) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 78),I= 85, 90) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 78),I= 91, 96) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 78),I= 97,102) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 78),I=103,108) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 78),I=109,114) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 78),I=115,120) /-2.469135802469136D-02 - $ 
,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ + DATA (CF(I),I=6315,6357) /4096,-160,-16,992,-448,-160,992,-16 + $ ,128,1136,992,1280,-160,128,-1024,-16,-160,128,-16,200,20,20, + $ -160,-160,1280,-106,-124,-88,-268,-142,38,-124,992,1028,1010 + $ ,1136,-142,-16,128,2,20,-16,2/ C 1 T(6,1,7,5,2,3,4) - DATA (CF(I, 79),I= 1, 6) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 79),I= 7, 12) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 79),I= 13, 18) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 79),I= 19, 24) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 79),I= 25, 30) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 79),I= 31, 36) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 79),I= 37, 42) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 79),I= 43, 48) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 79),I= 49, 54) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 79),I= 55, 60) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 79),I= 61, 66) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 79),I= 67, 72) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I, 79),I= 73, 78) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 79),I= 79, 84) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 79),I= 85, 90) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 79),I= 91, 96) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 79),I= 97,102) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 79),I=103,108) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ 
-1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 79),I=109,114) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 79),I=115,120) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ + DATA (CF(I),I=6358,6399) /4096,-1024,-1024,128,128,1280,1280, + $ -160,128,-16,992,1136,-160,992,-16,-160,-448,992,-88,56,-232 + $ ,884,-124,-106,1028,-124,-88,-106,20,200,272,-232,-232,-88,-88 + $ ,1028,20,-124,2,20,56,-124/ C 1 T(6,2,1,5,7,3,4) - DATA (CF(I, 80),I= 1, 6) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 80),I= 7, 12) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 80),I= 13, 18) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 80),I= 19, 24) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 80),I= 25, 30) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 80),I= 31, 36) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 80),I= 37, 42) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 80),I= 43, 48) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 80),I= 49, 54) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 80),I= 55, 60) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 80),I= 61, 66) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 80),I= 67, 72) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 80),I= 73, 78) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 80),I= 79, 84) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 80),I= 85, 90) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 80),I= 91, 96) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA 
(CF(I, 80),I= 97,102) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 80),I=103,108) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 80),I=109,114) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I, 80),I=115,120) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ + DATA (CF(I),I=6400,6440) /4096,128,1280,-1024,128,-160,992,-16, + $ -160,-448,992,1280,-160,128,-16,992,1136,56,-448,-88,-268,992, + $ -124,-124,992,-106,38,-160,20,-232,884,884,-268,-268,1010,-160 + $ ,20,-16,2,-124,-142/ C 1 T(6,2,1,7,5,3,4) - DATA (CF(I, 81),I= 1, 6) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 81),I= 7, 12) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 81),I= 13, 18) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 81),I= 19, 24) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 81),I= 25, 30) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 81),I= 31, 36) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 81),I= 37, 42) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 81),I= 43, 48) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 81),I= 49, 54) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 81),I= 55, 60) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 81),I= 61, 66) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 81),I= 67, 72) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I, 81),I= 73, 78) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 81),I= 79, 84) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 81),I= 85, 90) /1.975308641975309D-01, - $ 
-2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 81),I= 91, 96) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 81),I= 97,102) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 81),I=103,108) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 81),I=109,114) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 81),I=115,120) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ + DATA (CF(I),I=6441,6480) /4096,-1024,1280,128,128,-16,-1024,128, + $ -160,-16,992,-448,-160,-16,992,-160,-232,-88,272,-232,1028,-88, + $ -88,-106,1028,-124,200,20,-232,884,-88,56,-106,-124,-124,56,20 + $ ,2,-124,20/ C 1 T(6,2,5,1,7,3,4) - DATA (CF(I, 82),I= 1, 6) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 82),I= 7, 12) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 82),I= 13, 18) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 82),I= 19, 24) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 82),I= 25, 30) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 82),I= 31, 36) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 82),I= 37, 42) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 82),I= 43, 48) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 82),I= 49, 54) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 82),I= 55, 60) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 82),I= 61, 66) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 82),I= 67, 72) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I, 82),I= 73, 78) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ 
,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 82),I= 79, 84) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 82),I= 85, 90) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 82),I= 91, 96) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 82),I= 97,102) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 82),I=103,108) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 82),I=109,114) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 82),I=115,120) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ + DATA (CF(I),I=6481,6519) /4096,128,-1024,-16,-160,128,-1024,-16 + $ ,128,1136,992,-16,128,-160,1280,884,-268,-232,884,1010,-268, + $ -106,38,-124,992,20,-160,-88,-268,56,-448,-124,992,-142,-124,2, + $ -16,20,-160/ C 1 T(6,2,5,7,1,3,4) - DATA (CF(I, 83),I= 1, 6) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 83),I= 7, 12) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 83),I= 13, 18) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 83),I= 19, 24) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 83),I= 25, 30) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 83),I= 31, 36) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 83),I= 37, 42) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 83),I= 43, 48) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 83),I= 49, 54) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 83),I= 55, 60) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 83),I= 61, 66) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - 
DATA (CF(I, 83),I= 67, 72) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 83),I= 73, 78) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 83),I= 79, 84) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 83),I= 85, 90) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 83),I= 91, 96) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 83),I= 97,102) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 83),I=103,108) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 83),I=109,114) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I, 83),I=115,120) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ + DATA (CF(I),I=6520,6557) /4096,-1024,992,-448,-160,-16,992,-160 + $ ,128,-16,-1024,128,-160,-16,-124,992,1028,1010,1136,-142,20, + $ -160,200,20,1280,-160,-88,-268,-106,-124,38,-142,-16,2,128,-16 + $ ,20,2/ C 1 T(6,2,7,1,5,3,4) - DATA (CF(I, 84),I= 1, 6) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 84),I= 7, 12) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 84),I= 13, 18) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 84),I= 19, 24) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 84),I= 25, 30) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 84),I= 31, 36) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 84),I= 37, 42) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 84),I= 43, 48) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 84),I= 49, 54) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 84),I= 55, 60) /5.864197530864197D-02, - $ 
-1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 84),I= 61, 66) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 84),I= 67, 72) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 84),I= 73, 78) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 84),I= 79, 84) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 84),I= 85, 90) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 84),I= 91, 96) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 84),I= 97,102) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 84),I=103,108) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 84),I=109,114) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I, 84),I=115,120) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ + DATA (CF(I),I=6558,6594) /4096,1136,992,-16,128,-160,1280,-16, + $ -160,128,-1024,-16,128,-106,-124,-88,-268,-142,38,200,20,20, + $ -160,-160,1280,1028,1010,-124,992,-142,1136,2,20,-16,128,2,-16/ C 1 T(6,2,7,5,1,3,4) - DATA (CF(I, 85),I= 1, 6) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 85),I= 7, 12) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 85),I= 13, 18) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 85),I= 19, 24) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I, 85),I= 25, 30) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 85),I= 31, 36) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 85),I= 37, 42) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 85),I= 43, 48) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ 
-4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I, 85),I= 49, 54) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 85),I= 55, 60) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 85),I= 61, 66) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 85),I= 67, 72) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 85),I= 73, 78) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 85),I= 79, 84) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 85),I= 85, 90) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 85),I= 91, 96) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 85),I= 97,102) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 85),I=103,108) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 85),I=109,114) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 85),I=115,120) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ + DATA (CF(I),I=6595,6630) /4096,-1024,-1024,128,128,1280,992,-160 + $ ,-448,992,-16,-160,-232,884,-88,56,-106,-124,272,-232,-232,-88, + $ -88,1028,1028,-124,-88,-106,20,200,-124,20,56,-124,2,20/ C 1 T(6,5,1,2,7,3,4) - DATA (CF(I, 86),I= 1, 6) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 86),I= 7, 12) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 86),I= 13, 18) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 86),I= 19, 24) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I, 86),I= 25, 30) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 86),I= 31, 36) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA 
(CF(I, 86),I= 37, 42) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 86),I= 43, 48) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 86),I= 49, 54) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 86),I= 55, 60) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 86),I= 61, 66) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 86),I= 67, 72) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 86),I= 73, 78) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 86),I= 79, 84) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 86),I= 85, 90) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 86),I= 91, 96) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 86),I= 97,102) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 86),I=103,108) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I, 86),I=109,114) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 86),I=115,120) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ + DATA (CF(I),I=6631,6665) /4096,128,1280,-1024,128,-160,1280,992 + $ ,1136,128,-16,-88,-268,56,-448,-124,992,-232,884,884,-268,-268 + $ ,1010,-124,992,-106,38,-160,20,20,-160,-124,-142,-16,2/ C 1 T(6,5,1,7,2,3,4) - DATA (CF(I, 87),I= 1, 6) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 87),I= 7, 12) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 87),I= 13, 18) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 87),I= 19, 24) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I, 87),I= 25, 30) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ 
-2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 87),I= 31, 36) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 87),I= 37, 42) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 87),I= 43, 48) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I, 87),I= 49, 54) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 87),I= 55, 60) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 87),I= 61, 66) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 87),I= 67, 72) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 87),I= 73, 78) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 87),I= 79, 84) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 87),I= 85, 90) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 87),I= 91, 96) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 87),I= 97,102) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 87),I=103,108) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 87),I=109,114) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 87),I=115,120) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ + DATA (CF(I),I=6666,6699) /4096,-1024,1280,128,-448,992,992,-160, + $ -160,-16,272,-232,-232,-88,-88,1028,-232,884,-88,56,-106,-124, + $ -88,-106,1028,-124,200,20,56,-124,-124,20,20,2/ C 1 T(6,5,2,1,7,3,4) - DATA (CF(I, 88),I= 1, 6) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 88),I= 7, 12) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 88),I= 13, 18) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 88),I= 19, 24) 
/4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 88),I= 25, 30) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 88),I= 31, 36) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 88),I= 37, 42) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 88),I= 43, 48) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I, 88),I= 49, 54) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 88),I= 55, 60) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 88),I= 61, 66) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 88),I= 67, 72) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 88),I= 73, 78) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 88),I= 79, 84) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 88),I= 85, 90) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 88),I= 91, 96) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 88),I= 97,102) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I, 88),I=103,108) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 88),I=109,114) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 88),I=115,120) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ + DATA (CF(I),I=6700,6732) /4096,128,-1024,992,1136,-160,1280,-16 + $ ,128,-232,884,884,-268,-268,1010,-88,-268,56,-448,-124,992,-106 + $ ,38,-124,992,20,-160,-124,-142,20,-160,2,-16/ C 1 T(6,5,2,7,1,3,4) - DATA (CF(I, 89),I= 1, 6) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 89),I= 7, 12) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ 
-2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 89),I= 13, 18) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 89),I= 19, 24) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 89),I= 25, 30) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 89),I= 31, 36) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 89),I= 37, 42) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 89),I= 43, 48) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 89),I= 49, 54) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 89),I= 55, 60) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 89),I= 61, 66) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 89),I= 67, 72) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 89),I= 73, 78) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 89),I= 79, 84) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 89),I= 85, 90) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 89),I= 91, 96) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 89),I= 97,102) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I, 89),I=103,108) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I, 89),I=109,114) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 89),I=115,120) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ + DATA (CF(I),I=6733,6764) /4096,-1024,-16,128,-160,-16,-1024,128 + $ ,1028,1010,-124,992,-142,1136,-88,-268,-106,-124,38,-142,20, + $ -160,200,20,1280,-160,2,-16,20,2,128,-16/ C 1 T(6,5,7,1,2,3,4) - DATA (CF(I, 90),I= 1, 6) /-1.358024691358025D-01, - $ 
-3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 90),I= 7, 12) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 90),I= 13, 18) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 90),I= 19, 24) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 90),I= 25, 30) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 90),I= 31, 36) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 90),I= 37, 42) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 90),I= 43, 48) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 90),I= 49, 54) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 90),I= 55, 60) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 90),I= 61, 66) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 90),I= 67, 72) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 90),I= 73, 78) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 90),I= 79, 84) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 90),I= 85, 90) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 90),I= 91, 96) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 90),I= 97,102) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I, 90),I=103,108) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I, 90),I=109,114) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 90),I=115,120) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ + DATA 
(CF(I),I=6765,6795) /4096,-160,-16,-16,128,128,-1024,-88, + $ -268,-106,-124,38,-142,1028,1010,-124,992,-142,1136,200,20,20, + $ -160,-160,1280,20,2,2,-16,-16,128/ C 1 T(6,5,7,2,1,3,4) - DATA (CF(I, 91),I= 1, 6) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 91),I= 7, 12) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 91),I= 13, 18) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 91),I= 19, 24) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 91),I= 25, 30) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 91),I= 31, 36) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 91),I= 37, 42) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 91),I= 43, 48) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 91),I= 49, 54) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 91),I= 55, 60) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 91),I= 61, 66) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 91),I= 67, 72) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 91),I= 73, 78) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 91),I= 79, 84) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 91),I= 85, 90) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 91),I= 91, 96) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 91),I= 97,102) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 91),I=103,108) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 91),I=109,114) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ 
-1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 91),I=115,120) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ + DATA (CF(I),I=6796,6825) /4096,-1024,-1024,128,128,1280,2,-16,20 + $ ,2,128,-16,20,-160,-124,-142,-16,2,-124,20,56,-124,2,20,-1024 + $ ,128,128,-16,-16,-160/ C 1 T(6,7,1,2,5,3,4) - DATA (CF(I, 92),I= 1, 6) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 92),I= 7, 12) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 92),I= 13, 18) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 92),I= 19, 24) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 92),I= 25, 30) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 92),I= 31, 36) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 92),I= 37, 42) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 92),I= 43, 48) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 92),I= 49, 54) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 92),I= 55, 60) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 92),I= 61, 66) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 92),I= 67, 72) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 92),I= 73, 78) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 92),I= 79, 84) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 92),I= 85, 90) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 92),I= 91, 96) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 92),I= 97,102) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 92),I=103,108) /-1.913580246913580D-01 
- $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 92),I=109,114) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 92),I=115,120) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ + DATA (CF(I),I=6826,6854) /4096,128,1280,-1024,128,20,2,2,-16,-16 + $ ,128,-124,20,56,-124,2,20,20,-160,-124,-142,-16,2,128,-1024,-16 + $ ,-160,128,-16/ C 1 T(6,7,1,5,2,3,4) - DATA (CF(I, 93),I= 1, 6) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 93),I= 7, 12) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 93),I= 13, 18) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 93),I= 19, 24) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 93),I= 25, 30) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 93),I= 31, 36) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 93),I= 37, 42) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 93),I= 43, 48) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 93),I= 49, 54) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 93),I= 55, 60) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 93),I= 61, 66) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 93),I= 67, 72) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 93),I= 73, 78) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 93),I= 79, 84) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 93),I= 85, 90) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 93),I= 91, 96) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA 
(CF(I, 93),I= 97,102) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 93),I=103,108) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 93),I=109,114) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 93),I=115,120) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ + DATA (CF(I),I=6855,6882) /4096,-1024,1280,128,20,-160,-124,-142, + $ -16,2,2,-16,20,2,128,-16,56,-124,-124,20,20,2,128,-16,-1024,128 + $ ,-160,-16/ C 1 T(6,7,2,1,5,3,4) - DATA (CF(I, 94),I= 1, 6) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 94),I= 7, 12) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 94),I= 13, 18) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 94),I= 19, 24) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 94),I= 25, 30) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 94),I= 31, 36) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 94),I= 37, 42) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 94),I= 43, 48) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 94),I= 49, 54) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 94),I= 55, 60) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 94),I= 61, 66) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 94),I= 67, 72) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 94),I= 73, 78) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 94),I= 79, 84) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 94),I= 85, 90) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ 
,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 94),I= 91, 96) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 94),I= 97,102) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 94),I=103,108) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 94),I=109,114) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 94),I=115,120) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ + DATA (CF(I),I=6883,6909) /4096,128,-1024,-124,20,56,-124,2,20,20 + $ ,2,2,-16,-16,128,-124,-142,20,-160,2,-16,-16,-160,128,-1024,-16 + $ ,128/ C 1 T(6,7,2,5,1,3,4) - DATA (CF(I, 95),I= 1, 6) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 95),I= 7, 12) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 95),I= 13, 18) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 95),I= 19, 24) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 95),I= 25, 30) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 95),I= 31, 36) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 95),I= 37, 42) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 95),I= 43, 48) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 95),I= 49, 54) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 95),I= 55, 60) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 95),I= 61, 66) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 95),I= 67, 72) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 95),I= 73, 78) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 95),I= 79, 84) /-6.913580246913580D-01 - $ 
,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 95),I= 85, 90) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 95),I= 91, 96) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 95),I= 97,102) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 95),I=103,108) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 95),I=109,114) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 95),I=115,120) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ + DATA (CF(I),I=6910,6935) /4096,-1024,-124,-142,20,-160,2,-16,56, + $ -124,-124,20,20,2,2,-16,20,2,128,-16,-16,128,-160,-16,-1024,128/ C 1 T(6,7,5,1,2,3,4) - DATA (CF(I, 96),I= 1, 6) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 96),I= 7, 12) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 96),I= 13, 18) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 96),I= 19, 24) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 96),I= 25, 30) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 96),I= 31, 36) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 96),I= 37, 42) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 96),I= 43, 48) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 96),I= 49, 54) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 96),I= 55, 60) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 96),I= 61, 66) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 96),I= 67, 72) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 96),I= 73, 
78) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 96),I= 79, 84) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 96),I= 85, 90) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 96),I= 91, 96) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 96),I= 97,102) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 96),I=103,108) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 96),I=109,114) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 96),I=115,120) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ + DATA (CF(I),I=6936,6960) /4096,56,-124,-124,20,20,2,-124,-142,20 + $ ,-160,2,-16,20,2,2,-16,-16,128,-160,-16,-16,128,128,-1024/ C 1 T(6,7,5,2,1,3,4) - DATA (CF(I, 97),I= 1, 6) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 97),I= 7, 12) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 97),I= 13, 18) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 97),I= 19, 24) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 97),I= 25, 30) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 97),I= 31, 36) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 97),I= 37, 42) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 97),I= 43, 48) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 97),I= 49, 54) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 97),I= 55, 60) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 97),I= 61, 66) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA 
(CF(I, 97),I= 67, 72) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 97),I= 73, 78) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 97),I= 79, 84) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 97),I= 85, 90) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 97),I= 91, 96) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 97),I= 97,102) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 97),I=103,108) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 97),I=109,114) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 97),I=115,120) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ + DATA (CF(I),I=6961,6984) /4096,-1024,-1024,128,128,1280,-1024 + $ ,128,128,-16,-16,-160,128,-16,1280,-160,1136,992,-16,-160,-160 + $ ,992,992,-448/ C 1 T(7,1,2,5,6,3,4) - DATA (CF(I, 98),I= 1, 6) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 98),I= 7, 12) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 98),I= 13, 18) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 98),I= 19, 24) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 98),I= 25, 30) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 98),I= 31, 36) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 98),I= 37, 42) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 98),I= 43, 48) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 98),I= 49, 54) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 98),I= 55, 60) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ 
,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 98),I= 61, 66) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 98),I= 67, 72) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 98),I= 73, 78) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 98),I= 79, 84) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 98),I= 85, 90) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 98),I= 91, 96) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 98),I= 97,102) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 98),I=103,108) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 98),I=109,114) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 98),I=115,120) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ + DATA (CF(I),I=6985,7007) /4096,128,1280,-1024,128,128,-1024,-16, + $ -160,128,-16,-16,-160,-160,992,992,-448,128,-16,1280,-160,1136 + $ ,992/ C 1 T(7,1,2,6,5,3,4) - DATA (CF(I, 99),I= 1, 6) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 99),I= 7, 12) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 99),I= 13, 18) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 99),I= 19, 24) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 99),I= 25, 30) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 99),I= 31, 36) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 99),I= 37, 42) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 99),I= 43, 48) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 99),I= 49, 54) /3.086419753086420D-02, - $ 
-2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 99),I= 55, 60) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 99),I= 61, 66) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 99),I= 67, 72) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 99),I= 73, 78) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 99),I= 79, 84) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 99),I= 85, 90) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 99),I= 91, 96) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 99),I= 97,102) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 99),I=103,108) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 99),I=109,114) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 99),I=115,120) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ + DATA (CF(I),I=7008,7029) /4096,-1024,1280,128,128,-16,1280,-160 + $ ,1136,992,-1024,128,128,-16,-16,-160,-160,-16,992,-448,-160,992/ C 1 T(7,1,5,2,6,3,4) - DATA (CF(I,100),I= 1, 6) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,100),I= 7, 12) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,100),I= 13, 18) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,100),I= 19, 24) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,100),I= 25, 30) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,100),I= 31, 36) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,100),I= 37, 42) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I,100),I= 43, 
48) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I,100),I= 49, 54) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,100),I= 55, 60) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,100),I= 61, 66) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,100),I= 67, 72) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I,100),I= 73, 78) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,100),I= 79, 84) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I,100),I= 85, 90) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,100),I= 91, 96) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,100),I= 97,102) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,100),I=103,108) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I,100),I=109,114) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,100),I=115,120) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ + DATA (CF(I),I=7030,7050) /4096,128,-1024,-16,-160,-160,992,992, + $ -448,128,-1024,-16,-160,128,-16,-16,128,1136,992,1280,-160/ C 1 T(7,1,5,6,2,3,4) - DATA (CF(I,101),I= 1, 6) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,101),I= 7, 12) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,101),I= 13, 18) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,101),I= 19, 24) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I,101),I= 25, 30) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,101),I= 31, 36) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA 
(CF(I,101),I= 37, 42) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,101),I= 43, 48) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,101),I= 49, 54) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,101),I= 55, 60) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I,101),I= 61, 66) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,101),I= 67, 72) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I,101),I= 73, 78) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,101),I= 79, 84) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,101),I= 85, 90) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,101),I= 91, 96) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,101),I= 97,102) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I,101),I=103,108) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,101),I=109,114) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,101),I=115,120) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ + DATA (CF(I),I=7051,7070) /4096,-1024,-16,128,1136,992,1280,-160, + $ -160,-16,992,-448,-160,992,-1024,128,128,-16,-16,-160/ C 1 T(7,1,6,2,5,3,4) - DATA (CF(I,102),I= 1, 6) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,102),I= 7, 12) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,102),I= 13, 18) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,102),I= 19, 24) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,102),I= 25, 30) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ 
-02/ - DATA (CF(I,102),I= 31, 36) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I,102),I= 37, 42) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,102),I= 43, 48) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I,102),I= 49, 54) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,102),I= 55, 60) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I,102),I= 61, 66) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,102),I= 67, 72) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,102),I= 73, 78) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,102),I= 79, 84) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,102),I= 85, 90) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,102),I= 91, 96) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I,102),I= 97,102) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I,102),I=103,108) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,102),I=109,114) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,102),I=115,120) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ + DATA (CF(I),I=7071,7089) /4096,-160,-16,992,-448,-160,992,-16 + $ ,128,1136,992,1280,-160,128,-1024,-16,-160,128,-16/ C 1 T(7,1,6,5,2,3,4) - DATA (CF(I,103),I= 1, 6) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,103),I= 7, 12) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,103),I= 13, 18) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,103),I= 19, 24) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ 
-02/ - DATA (CF(I,103),I= 25, 30) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,103),I= 31, 36) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,103),I= 37, 42) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,103),I= 43, 48) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,103),I= 49, 54) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,103),I= 55, 60) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,103),I= 61, 66) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,103),I= 67, 72) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I,103),I= 73, 78) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,103),I= 79, 84) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,103),I= 85, 90) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,103),I= 91, 96) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I,103),I= 97,102) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,103),I=103,108) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I,103),I=109,114) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I,103),I=115,120) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ + DATA (CF(I),I=7090,7107) /4096,-1024,-1024,128,128,1280,1280, + $ -160,128,-16,992,1136,-160,992,-16,-160,-448,992/ C 1 T(7,2,1,5,6,3,4) - DATA (CF(I,104),I= 1, 6) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,104),I= 7, 12) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,104),I= 13, 18) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ 
+00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,104),I= 19, 24) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I,104),I= 25, 30) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,104),I= 31, 36) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,104),I= 37, 42) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,104),I= 43, 48) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,104),I= 49, 54) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,104),I= 55, 60) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,104),I= 61, 66) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,104),I= 67, 72) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I,104),I= 73, 78) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,104),I= 79, 84) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,104),I= 85, 90) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,104),I= 91, 96) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I,104),I= 97,102) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,104),I=103,108) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I,104),I=109,114) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,104),I=115,120) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ + DATA (CF(I),I=7108,7124) /4096,128,1280,-1024,128,-160,992,-16, + $ -160,-448,992,1280,-160,128,-16,992,1136/ C 1 T(7,2,1,6,5,3,4) - DATA (CF(I,105),I= 1, 6) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,105),I= 7, 12) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ 
-4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,105),I= 13, 18) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I,105),I= 19, 24) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,105),I= 25, 30) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,105),I= 31, 36) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,105),I= 37, 42) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,105),I= 43, 48) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,105),I= 49, 54) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,105),I= 55, 60) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,105),I= 61, 66) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,105),I= 67, 72) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,105),I= 73, 78) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I,105),I= 79, 84) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,105),I= 85, 90) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,105),I= 91, 96) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I,105),I= 97,102) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I,105),I=103,108) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I,105),I=109,114) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,105),I=115,120) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ + DATA (CF(I),I=7125,7140) /4096,-1024,1280,128,128,-16,-1024,128, + $ -160,-16,992,-448,-160,-16,992,-160/ C 1 T(7,2,5,1,6,3,4) - DATA (CF(I,106),I= 1, 6) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ 
-1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,106),I= 7, 12) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,106),I= 13, 18) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I,106),I= 19, 24) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I,106),I= 25, 30) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,106),I= 31, 36) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,106),I= 37, 42) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,106),I= 43, 48) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,106),I= 49, 54) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,106),I= 55, 60) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,106),I= 61, 66) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,106),I= 67, 72) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,106),I= 73, 78) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I,106),I= 79, 84) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,106),I= 85, 90) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,106),I= 91, 96) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,106),I= 97,102) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I,106),I=103,108) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,106),I=109,114) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,106),I=115,120) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ + DATA (CF(I),I=7141,7155) 
/4096,128,-1024,-16,-160,128,-1024,-16 + $ ,128,1136,992,-16,128,-160,1280/ C 1 T(7,2,5,6,1,3,4) - DATA (CF(I,107),I= 1, 6) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,107),I= 7, 12) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I,107),I= 13, 18) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,107),I= 19, 24) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,107),I= 25, 30) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,107),I= 31, 36) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,107),I= 37, 42) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,107),I= 43, 48) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I,107),I= 49, 54) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I,107),I= 55, 60) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,107),I= 61, 66) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,107),I= 67, 72) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I,107),I= 73, 78) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,107),I= 79, 84) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,107),I= 85, 90) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,107),I= 91, 96) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,107),I= 97,102) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,107),I=103,108) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I,107),I=109,114) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I,107),I=115,120) 
/1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ + DATA (CF(I),I=7156,7169) /4096,-1024,992,-448,-160,-16,992,-160 + $ ,128,-16,-1024,128,-160,-16/ C 1 T(7,2,6,1,5,3,4) - DATA (CF(I,108),I= 1, 6) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,108),I= 7, 12) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I,108),I= 13, 18) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,108),I= 19, 24) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I,108),I= 25, 30) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,108),I= 31, 36) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,108),I= 37, 42) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,108),I= 43, 48) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,108),I= 49, 54) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I,108),I= 55, 60) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,108),I= 61, 66) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,108),I= 67, 72) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,108),I= 73, 78) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,108),I= 79, 84) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,108),I= 85, 90) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,108),I= 91, 96) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,108),I= 97,102) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,108),I=103,108) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA 
(CF(I,108),I=109,114) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,108),I=115,120) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ + DATA (CF(I),I=7170,7182) /4096,1136,992,-16,128,-160,1280,-16, + $ -160,128,-1024,-16,128/ C 1 T(7,2,6,5,1,3,4) - DATA (CF(I,109),I= 1, 6) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,109),I= 7, 12) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,109),I= 13, 18) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,109),I= 19, 24) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,109),I= 25, 30) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,109),I= 31, 36) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,109),I= 37, 42) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,109),I= 43, 48) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I,109),I= 49, 54) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,109),I= 55, 60) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,109),I= 61, 66) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,109),I= 67, 72) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,109),I= 73, 78) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,109),I= 79, 84) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,109),I= 85, 90) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,109),I= 91, 96) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,109),I= 97,102) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA 
(CF(I,109),I=103,108) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I,109),I=109,114) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I,109),I=115,120) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ + DATA (CF(I),I=7183,7194) /4096,-1024,-1024,128,128,1280,992,-160 + $ ,-448,992,-16,-160/ C 1 T(7,5,1,2,6,3,4) - DATA (CF(I,110),I= 1, 6) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,110),I= 7, 12) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,110),I= 13, 18) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,110),I= 19, 24) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,110),I= 25, 30) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,110),I= 31, 36) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,110),I= 37, 42) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,110),I= 43, 48) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I,110),I= 49, 54) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,110),I= 55, 60) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,110),I= 61, 66) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,110),I= 67, 72) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,110),I= 73, 78) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,110),I= 79, 84) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,110),I= 85, 90) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,110),I= 91, 96) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA 
(CF(I,110),I= 97,102) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,110),I=103,108) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,110),I=109,114) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I,110),I=115,120) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ + DATA (CF(I),I=7195,7205) /4096,128,1280,-1024,128,-160,1280,992 + $ ,1136,128,-16/ C 1 T(7,5,1,6,2,3,4) - DATA (CF(I,111),I= 1, 6) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,111),I= 7, 12) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,111),I= 13, 18) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,111),I= 19, 24) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I,111),I= 25, 30) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,111),I= 31, 36) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,111),I= 37, 42) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,111),I= 43, 48) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,111),I= 49, 54) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,111),I= 55, 60) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,111),I= 61, 66) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,111),I= 67, 72) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,111),I= 73, 78) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,111),I= 79, 84) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,111),I= 85, 90) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,111),I= 91, 96) 
/8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,111),I= 97,102) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I,111),I=103,108) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,111),I=109,114) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I,111),I=115,120) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ + DATA (CF(I),I=7206,7215) /4096,-1024,1280,128,-448,992,992,-160, + $ -160,-16/ C 1 T(7,5,2,1,6,3,4) - DATA (CF(I,112),I= 1, 6) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,112),I= 7, 12) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,112),I= 13, 18) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,112),I= 19, 24) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I,112),I= 25, 30) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,112),I= 31, 36) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,112),I= 37, 42) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,112),I= 43, 48) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,112),I= 49, 54) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,112),I= 55, 60) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,112),I= 61, 66) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,112),I= 67, 72) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,112),I= 73, 78) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,112),I= 79, 84) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,112),I= 85, 90) 
/-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,112),I= 91, 96) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,112),I= 97,102) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,112),I=103,108) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,112),I=109,114) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,112),I=115,120) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ + DATA (CF(I),I=7216,7224) /4096,128,-1024,992,1136,-160,1280,-16 + $ ,128/ C 1 T(7,5,2,6,1,3,4) - DATA (CF(I,113),I= 1, 6) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I,113),I= 7, 12) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,113),I= 13, 18) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,113),I= 19, 24) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,113),I= 25, 30) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I,113),I= 31, 36) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,113),I= 37, 42) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,113),I= 43, 48) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I,113),I= 49, 54) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,113),I= 55, 60) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,113),I= 61, 66) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,113),I= 67, 72) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I,113),I= 73, 78) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,113),I= 79, 84) /-1.358024691358025D-01, - $ 
-4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,113),I= 85, 90) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,113),I= 91, 96) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,113),I= 97,102) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,113),I=103,108) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I,113),I=109,114) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I,113),I=115,120) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ + DATA (CF(I),I=7225,7232) /4096,-1024,-16,128,-160,-16,-1024,128/ C 1 T(7,5,6,1,2,3,4) - DATA (CF(I,114),I= 1, 6) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I,114),I= 7, 12) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,114),I= 13, 18) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,114),I= 19, 24) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I,114),I= 25, 30) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I,114),I= 31, 36) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,114),I= 37, 42) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,114),I= 43, 48) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,114),I= 49, 54) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,114),I= 55, 60) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,114),I= 61, 66) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,114),I= 67, 72) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,114),I= 73, 78) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, 
- $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,114),I= 79, 84) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,114),I= 85, 90) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,114),I= 91, 96) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,114),I= 97,102) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I,114),I=103,108) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,114),I=109,114) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I,114),I=115,120) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ + DATA (CF(I),I=7233,7239) /4096,-160,-16,-16,128,128,-1024/ C 1 T(7,5,6,2,1,3,4) - DATA (CF(I,115),I= 1, 6) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,115),I= 7, 12) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,115),I= 13, 18) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,115),I= 19, 24) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,115),I= 25, 30) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,115),I= 31, 36) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,115),I= 37, 42) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,115),I= 43, 48) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,115),I= 49, 54) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,115),I= 55, 60) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,115),I= 61, 66) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,115),I= 67, 72) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ 
-1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,115),I= 73, 78) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,115),I= 79, 84) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,115),I= 85, 90) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,115),I= 91, 96) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,115),I= 97,102) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I,115),I=103,108) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,115),I=109,114) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,115),I=115,120) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ + DATA (CF(I),I=7240,7245) /4096,-1024,-1024,128,128,1280/ C 1 T(7,6,1,2,5,3,4) - DATA (CF(I,116),I= 1, 6) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,116),I= 7, 12) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,116),I= 13, 18) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,116),I= 19, 24) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,116),I= 25, 30) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,116),I= 31, 36) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,116),I= 37, 42) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,116),I= 43, 48) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,116),I= 49, 54) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,116),I= 55, 60) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,116),I= 61, 66) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - 
$ ,3.086419753086420D-02/ - DATA (CF(I,116),I= 67, 72) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,116),I= 73, 78) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,116),I= 79, 84) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,116),I= 85, 90) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,116),I= 91, 96) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,116),I= 97,102) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,116),I=103,108) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,116),I=109,114) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,116),I=115,120) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ + DATA (CF(I),I=7246,7250) /4096,128,1280,-1024,128/ C 1 T(7,6,1,5,2,3,4) - DATA (CF(I,117),I= 1, 6) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,117),I= 7, 12) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,117),I= 13, 18) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,117),I= 19, 24) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,117),I= 25, 30) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,117),I= 31, 36) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,117),I= 37, 42) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,117),I= 43, 48) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,117),I= 49, 54) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,117),I= 55, 60) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,117),I= 61, 66) 
/-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,117),I= 67, 72) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,117),I= 73, 78) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,117),I= 79, 84) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,117),I= 85, 90) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,117),I= 91, 96) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,117),I= 97,102) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,117),I=103,108) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I,117),I=109,114) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,117),I=115,120) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ + DATA (CF(I),I=7251,7254) /4096,-1024,1280,128/ C 1 T(7,6,2,1,5,3,4) - DATA (CF(I,118),I= 1, 6) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,118),I= 7, 12) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,118),I= 13, 18) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,118),I= 19, 24) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,118),I= 25, 30) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,118),I= 31, 36) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,118),I= 37, 42) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,118),I= 43, 48) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,118),I= 49, 54) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,118),I= 55, 60) /-1.358024691358025D-01, - $ 
-4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,118),I= 61, 66) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,118),I= 67, 72) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,118),I= 73, 78) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,118),I= 79, 84) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,118),I= 85, 90) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,118),I= 91, 96) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,118),I= 97,102) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,118),I=103,108) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,118),I=109,114) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,118),I=115,120) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ + DATA (CF(I),I=7255,7257) /4096,128,-1024/ C 1 T(7,6,2,5,1,3,4) - DATA (CF(I,119),I= 1, 6) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,119),I= 7, 12) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,119),I= 13, 18) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,119),I= 19, 24) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,119),I= 25, 30) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,119),I= 31, 36) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,119),I= 37, 42) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,119),I= 43, 48) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,119),I= 49, 54) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ 
,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,119),I= 55, 60) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,119),I= 61, 66) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,119),I= 67, 72) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,119),I= 73, 78) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,119),I= 79, 84) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,119),I= 85, 90) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,119),I= 91, 96) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I,119),I= 97,102) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,119),I=103,108) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,119),I=109,114) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I,119),I=115,120) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ + DATA (CF(I),I=7258,7259) /4096,-1024/ C 1 T(7,6,5,1,2,3,4) - DATA (CF(I,120),I= 1, 6) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,120),I= 7, 12) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,120),I= 13, 18) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,120),I= 19, 24) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,120),I= 25, 30) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,120),I= 31, 36) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,120),I= 37, 42) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,120),I= 43, 48) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ 
- DATA (CF(I,120),I= 49, 54) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,120),I= 55, 60) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,120),I= 61, 66) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,120),I= 67, 72) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,120),I= 73, 78) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,120),I= 79, 84) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,120),I= 85, 90) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,120),I= 91, 96) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,120),I= 97,102) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,120),I=103,108) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,120),I=109,114) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,120),I=115,120) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ + DATA (CF(I),I=7260,7260) /4096/ C 1 T(7,6,5,2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
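The hunks above replace the dense REAL*8 colour matrix CF(J,I) of the generated matrix1.f with a single packed integer array CF(I): for each colour row I only the entries J >= I are kept, the off-diagonal coefficients are doubled so that the symmetric lower triangle never needs to be visited, and everything is rescaled to integers over one common denominator (the tabulated values are consistent with a denominator of 324, e.g. the diagonal 1.264197530864197D+01 becomes 4096 and the off-diagonal -1.580246913580247D+00 becomes -1024 after doubling). The MATRIX1 hunk that follows then runs the inner colour loop only over J = I, NCOLOR with a running CF_INDEX and performs a single division by DENOM at the end. A minimal standalone C++ sketch of that triangular colour sum, assuming this packing convention (the names colorSum, cf, jamp and denom are illustrative, not part of the generated code):

#include <complex>
#include <vector>

// cf holds, row by row, the packed entries for J >= I: the diagonal as-is and the
// off-diagonal already doubled, all scaled by the common integer denominator 'denom'.
double colorSum( const std::vector<long>& cf,
                 const std::vector<std::complex<double>>& jamp,
                 const double denom )
{
  const int ncolor = static_cast<int>( jamp.size() );
  double me2 = 0;
  int cfIndex = 0; // plays the role of CF_INDEX in the Fortran loop
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = i; j < ncolor; j++ ) // only J >= I, as in "DO J = I, NCOLOR"
      ztemp += static_cast<double>( cf[cfIndex++] ) * jamp[j];
    me2 += ( ztemp * std::conj( jamp[i] ) ).real();
  }
  return me2 / denom; // single "MATRIX1 = MATRIX1/DENOM" at the end
}

int main()
{
  // toy 2x2 symmetric colour matrix {{3,1},{1,3}}: packed upper triangle with doubled off-diagonal is {3,2,3}
  const std::vector<long> cf = { 3, 2, 3 };
  const std::vector<std::complex<double>> jamp = { { 1., 0. }, { 0., 1. } };
  return ( colorSum( cf, jamp, 1. ) == 6. ) ? 0 : 1; // conj(J)^T CF J = 6 for J = (1, i)
}

Compared with the old dense double-precision table this roughly halves the number of stored coefficients and keeps them as exact integers, at the cost of one extra division per colour sum.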
@@ -18811,10 +10161,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
      MATRIX1 = 0.D0
      DO M = 1, NAMPSO
+       CF_INDEX = 0
        DO I = 1, NCOLOR
          ZTEMP = (0.D0,0.D0)
-         DO J = 1, NCOLOR
-           ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M)
+         DO J = I, NCOLOR
+           CF_INDEX = CF_INDEX + 1
+           ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M)
          ENDDO
          DO N = 1, NAMPSO
@@ -18823,6 +10175,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
          ENDDO
        ENDDO
      ENDDO
+     MATRIX1 = MATRIX1/DENOM
      IF(SDE_STRAT.EQ.1)THEN
        AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1))
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/addmothers.f
index 9a31ed201d..d6cded9a2d 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/addmothers.f
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/addmothers.f
@@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff,
      integer icol ! color selected
      integer isym(nexternal,99), jsym
-      integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic
+      integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic
      integer mo_color,da_color(2),itmp
      integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg
      integer icolalt(2,-nexternal+2:2*nexternal-3)
@@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff,
        endif
        lconfig = vec_igraph1(ivec)
      endif
-
+      is_LC=.true.
+      maxcolor=0
c
c     Choose a color flow which is certain to work with the propagator
c     structure of the chosen diagram and use that as an alternative
c
      if (icol.eq.0) then
        do i=1,nexternal
-          icolalt(1,i)=0
+          icolalt(1,i)=0
          icolalt(2,i)=0
        enddo
      else
@@ -220,7 +221,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff,
          ncolmp=0
        endif
        if(mo_color.gt.1.and.
-     $     mo_color.ne.3.and.mo_color.ne.8)then
+     $     mo_color.ne.3.and.mo_color.ne.8.and.mo_color.ne.6)then
          da_color(1)=get_color(jpart(1,ida(1)))
          da_color(2)=get_color(jpart(1,ida(2)))
          call write_error(da_color(1), da_color(2), mo_color)
@@ -326,8 +327,8 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff,
          endif
        endif !end of check on LC
-c       Just zero helicity info for intermediate states
-        jpart(7,i) = 0
+c       Just No helicity info for intermediate states
+        jpart(7,i) = 9
      enddo !
do i 100 continue if (is_LC) call check_pure_internal_flow(icolalt,jpart, maxcolor) @@ -586,13 +587,13 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, i3=i3+1 c color for t-channels needs to be reversed if(i3.eq.1) icol(2,ires)=icolmp(1,i) - if(i3.eq.2) icol(1,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 c color for t-channels needs to be reversed if(i3bar.eq.1) icol(1,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(2,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(2,i) endif enddo @@ -764,6 +765,14 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, endif endif c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) + elseif(mo_color.eq.6.and.i3.eq.0.and.i3bar.eq.2)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue + elseif(mo_color.eq.6.and.i3.eq.2.and.i3bar.eq.0)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue else c Don't know how to deal with this call write_error(i3,i3bar,mo_color) @@ -814,12 +823,12 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(icolmp(1,i).gt.0)then i3=i3+1 if(i3.eq.1) icol(1,ires)=icolmp(1,i) - if(i3.eq.2) icol(2,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 if(i3bar.eq.1) icol(2,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(1,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(2,i) endif enddo @@ -830,23 +839,33 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(n3.le.1.and.n3bar.eq.0) icol(2,ires)=0 if(i3.ne.n3.or.i3bar.ne.n3bar) then - if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.0.and.i3.eq.0)then + if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.i3)then c This is an epsilon index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(1,ires)=maxcolor + if(i3.eq.0) then + maxcolor=maxcolor+1 + icol(1,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(2,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(2,ires)=-maxcolor endif - elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.0.and.i3bar.eq.0)then + elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.i3bar)then c This is an epsilonbar index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(2,ires)=maxcolor + if(i3bar.eq.0)then + maxcolor=maxcolor+1 + icol(2,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(1,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(1,ires)=-maxcolor endif elseif(n3.gt.0.and.n3bar.eq.0.and.i3-i3bar.eq.n3.or. $ n3bar.gt.0.and.n3.eq.0.and.i3bar-i3.eq.n3bar.or. 
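The fix_tchannel_color and elim_indices hunks above make two recurring changes: a mother's colour and anticolour slots (icol(1,ires), icol(2,ires)) are now filled only while still unset, so a later daughter index can no longer overwrite an existing label, and the epsilon/epsilon-bar vertex tests are relaxed from requiring i3 (or i3bar) to be exactly zero to only requiring that the index count closes modulo 3. A small C++ transliteration of those two tests, assuming (as the routine arguments suggest) that i3/i3bar count the triplet/antitriplet indices found on the daughter legs and n3/n3bar those requested for the mother; assignIfUnset, isEpsilonVertex and isEpsilonBarVertex are illustrative names, not routines in the Fortran sources:

#include <cassert>

// Fill a colour slot only if it is still empty (0), mirroring the new
// "...and.icol(..,ires).eq.0" guards added above.
inline void assignIfUnset( int& slot, int label )
{
  if( slot == 0 ) slot = label;
}

// Epsilon / epsilon-bar vertex tests sketched from the new conditions
// "mod(i3bar+n3,3).eq.i3" and "mod(i3+n3bar,3).eq.i3bar".
inline bool isEpsilonVertex( int i3, int i3bar, int n3, int n3bar )
{
  return n3 > 0 && n3bar == 0 && ( i3bar + n3 ) % 3 == i3;
}

inline bool isEpsilonBarVertex( int i3, int i3bar, int n3, int n3bar )
{
  return n3bar > 0 && n3 == 0 && ( i3 + n3bar ) % 3 == i3bar;
}

int main()
{
  int slot = 501;                    // a colour label already assigned
  assignIfUnset( slot, -502 );
  assert( slot == 501 );             // the existing label is preserved
  int empty = 0;
  assignIfUnset( empty, -502 );
  assert( empty == -502 );           // an empty slot is filled as before
  assert( isEpsilonVertex( 0, 1, 2, 0 ) );     // one antitriplet daughter, two mother triplet indices
  assert( !isEpsilonBarVertex( 0, 1, 2, 0 ) ); // not an epsilon-bar configuration (n3 > 0)
  return 0;
}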
@@ -961,6 +980,12 @@ subroutine fix_s_color_indices(n3,n3bar,i3,i3bar,ncolmp,icolmp, if(n3.eq.1) icol(1,ires)=max_n3 if(n3bar.eq.1) icol(2,ires)=min_n3bar endif + do i=ires,-1 + if (icol(1,i).eq.maxcol) icol(1,i)=mincol + if (icol(1,i).eq.-maxcol) icol(1,i)=-mincol + if (icol(2,i).eq.maxcol) icol(2,i)=mincol + if (icol(2,i).eq.-maxcol) icol(2,i)=-mincol + enddo c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) endif else diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cluster.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/color_sum.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x 
* blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cuts.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. + return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. 
Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. + +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/genps.f index 1c32e93f5d..5449ab9e30 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/genps.f @@ -124,7 +124,8 @@ subroutine gen_mom(iconfig,mincfig,maxcfig,invar,wgt,x,p1) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer fake_id + common/to_sprop/sprop,tprid,fake_id logical firsttime double precision xprop(3,nexternal),tprop(3,nexternal) @@ -1373,6 +1374,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1389,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not. 
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1637,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1657,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1818,8 +1834,8 @@ double precision function get_channel_cut(p, config) common/to_forest/ iforest, tstrategy integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) - integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer tprid(-max_branch:-1,lmaxconfigs), fake_id + common/to_sprop/sprop,tprid,fake_id double precision stot,m1,m2 common/to_stot/stot,m1,m2 @@ -1915,7 +1931,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1946,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile deleted file mode 100644 index 49e6800fff..0000000000 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile +++ /dev/null @@ -1,327 +0,0 @@ -SHELL := /bin/bash - -include ../../Source/make_opts - -# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) -# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing -include ../../src/cudacpp_config.mk -ifeq ($(CUDACPP_BUILDDIR),) -$(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) -endif - -# Disable all Fortran warnings? 
-FFLAGS+= -w - -# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html -FFLAGS+= -cpp - -# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) -CXXFLAGS = -O3 -Wall -Wshadow -Wextra - -# Add -std=c++17 explicitly to avoid build errors on macOS -# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" -ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 -endif - -# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) -ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) - override CXX:=ccache $(CXX) -endif -###ifeq ($(USECCACHE)$(shell echo $(FC) | grep ccache),1) -### override FC:=ccache $(FC) -###endif - -# Load additional dependencies of the bias module, if present -ifeq (,$(wildcard ../bias_dependencies)) -BIASDEPENDENCIES = -else -include ../bias_dependencies -endif - -# Definitions - -LIBDIR = ../../lib/ -BINDIR = ../../bin/ -PROG = madevent - -ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") - include ../MadLoop_makefile_definitions -else - LINK_LOOP_LIBS = - LOOP_LIBS = - LOOP_INCLUDE = - LINK_MADLOOP_LIB = - MADLOOP_LIB = -endif - -LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias - -CUDACPP_MAKEFILE=cudacpp.mk -processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') -ifeq ($(BACKEND),cuda) -CUDACPP_COMMONLIB=mg5amc_common_cuda -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda -else ifeq ($(BACKEND),hip) -CUDACPP_COMMONLIB=mg5amc_common_hip -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip -else -CUDACPP_COMMONLIB=mg5amc_common_cpp -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp -endif - -LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) - -ifneq ("$(wildcard ../../Source/RUNNING)","") - LINKLIBS += -lrunning - LIBS += $(LIBDIR)librunning.$(libext) -endif - - -# Source files - -MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) -MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) -ifeq ($(strip $(MATRIX_HEL)),) - MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) -endif - - -PROCESS= myamp.o genps.o unwgt.o setcuts.o get_color.o \ - cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ - idenparts.o dummy_fct.o - -DSIG=driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) -DSIG_cudacpp=driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) - -SYMMETRY = symmetry.o idenparts.o - -# Binaries - -ifeq ($(UNAME),Darwin) -LDFLAGS += -lc++ # avoid 'Undefined symbols' for chrono::steady_clock on macOS (checked with otool -L libmg5amc_gg_ttx_cpp.so) -LDFLAGS += -mmacosx-version-min=11.3 # avoid "ld: warning: object file was built for newer macOS version than being linked" -else -LDFLAGS += -Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 (not supported on macOS) -endif - -# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) -.DEFAULT_GOAL := all - -ifeq ($(BACKEND),cuda) -all: $(PROG)_fortran 
$(CUDACPP_BUILDDIR)/$(PROG)_cuda -else ifeq ($(BACKEND),hip) -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip -else -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp -endif - -# Disable OpenMP by default: enable OpenMP only if USEOPENMP=1 (#758) -ifeq ($(USEOPENMP),1) -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -override OMPFLAGS = -fopenmp -LINKLIBS += -liomp5 # see #578 -LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -override OMPFLAGS = -fopenmp -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 -else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang -else -override OMPFLAGS = -fopenmp -endif -endif - -$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o - $(FC) -o $(PROG)_fortran $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) - -$(LIBS): .libs - -.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat - cd ../../Source; make - touch $@ - -$(CUDACPP_BUILDDIR)/.cudacpplibs: - $(MAKE) -f $(CUDACPP_MAKEFILE) - touch $@ - -# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH -# Use relative paths with respect to the executables ($ORIGIN on Linux) -# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary -ifeq ($(UNAME_S),Darwin) - override LIBFLAGSRPATH = -else ifeq ($(USEBUILDDIR),1) - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' -else - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' -endif - -.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link - -madevent_fortran_link: $(PROG)_fortran - rm -f $(PROG) - ln -s $(PROG)_fortran $(PROG) - -madevent_cuda_link: - $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) - -madevent_hip_link: - $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) - -madevent_cpp_link: - $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -override SUPPORTED_AVXS = cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto -madevent_%_link: - @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then echo "ERROR! 
Invalid target '$@' (supported madevent_cpp*_link targets are: $(foreach avx,$(SUPPORTED_AVXS),'madevent_cpp$(avx)_link'))"; exit 1; fi - $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_cuda now uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_hip also uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -counters.o: counters.cc timer.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -ompnumthreads.o: ompnumthreads.cc ompnumthreads.h - $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ - -$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) - $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) - -gensym: $(SYMMETRY) configs.inc $(LIBS) - $(FC) -o gensym $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) - -###ifeq (,$(wildcard fbridge.inc)) # Pointless: fbridge.inc always exists as this is the cudacpp-modified makefile! -###$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat -### cd ../../Source/MODEL; make -### -###$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat -### cd ../../Source; make -### -###$(LIBDIR)libpdf.$(libext): -### cd ../../Source/PDF; make -### -###$(LIBDIR)libgammaUPC.$(libext): -### cd ../../Source/PDF/gammaUPC; make -###endif - -# Add source so that the compiler finds the DiscreteSampler module. 
-$(MATRIX): %.o: %.f - $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%.o: %.f - $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%_cudacpp.o: %.f - $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ - -# Dependencies - -driver.f: genps.inc -symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc -genps.o: genps.inc nexternal.inc configs.inc -dummy_fct.0: run.inc genps.inc -cuts.o: genps.inc nexternal.inc pmass.inc -setcuts.o: genps.inc run_config.inc -invarients.o: genps.inc nexternal.inc -myamp.o: props.inc genps.inc nexternal.inc -reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ - run_config.inc -cluster.o: cluster.inc genps.inc nexternal.inc message.inc -addmothers.o: genps.inc nexternal.inc symswap.inc message.inc -unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ - run_config.inc -initcluster.o: message.inc - -# Extra dependencies on discretesampler.mod - -auto_dsig.o: .libs -driver.o: .libs -driver_cudacpp.o: .libs -$(MATRIX): .libs -genps.o: .libs - -# Cudacpp bldall targets - -ifeq ($(UNAME_P),ppc64le) -bldavxs: bldnone bldsse4 -else ifeq ($(UNAME_P),arm) -bldavxs: bldnone bldsse4 -else -bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z -endif - -ifneq ($(shell which hipcc 2>/dev/null),) -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldhip bldcuda bldavxs -else -bldall: bldhip bldavxs -endif -else -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldcuda bldavxs -else -bldall: bldavxs -endif -endif - -bldcuda: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cuda - -bldhip: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=hip - -bldnone: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppnone - -bldsse4: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 - -bldavx2: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 - -bld512y: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y - -bld512z: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z - -# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) - -clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn - $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(CUDACPP_BUILDDIR)/$(PROG)_hip - -cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src - $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall - rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs - rm -f .libs - -cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src - make -C ../../Source cleanall - rm -rf $(LIBDIR)libbias.$(libext) - rm -f ../../Source/*.mod ../../Source/*/*.mod - -distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation - $(MAKE) -f $(CUDACPP_MAKEFILE) distclean diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile new file mode 120000 index 0000000000..9fba275947 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile @@ -0,0 +1 @@ +makefile_wrapper.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile_original.mk new 
file mode 100644 index 0000000000..348c283be7 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile_original.mk @@ -0,0 +1,101 @@ +include ../../Source/make_opts +FFLAGS+= -w + +# Load additional dependencies of the bias module, if present +ifeq (,$(wildcard ../bias_dependencies)) +BIASDEPENDENCIES = +else +include ../bias_dependencies +endif + +# Definitions + +LIBDIR = ../../lib/ +BINDIR = ../../bin/ +PROG = madevent + +ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") + include ../MadLoop_makefile_definitions +else + LINK_LOOP_LIBS = + LOOP_LIBS = + LOOP_INCLUDE = + LINK_MADLOOP_LIB = + MADLOOP_LIB = +endif + +LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L../../lib/ -ldhelas -ldsample -lmodel -lgeneric -lpdf -lgammaUPC -lcernlib $(llhapdf) -lbias + +LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) + +ifneq ("$(wildcard ../../Source/RUNNING)","") + LINKLIBS += -lrunning + LIBS += $(LIBDIR)librunning.$(libext) +endif + + +# Source files + +MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) +MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) +ifeq ($(strip $(MATRIX_HEL)),) + MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) +endif + + +PROCESS= driver.o myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o \ + $(patsubst %.f,%.o,$(wildcard auto_dsig*.f)) \ + +SYMMETRY = symmetry.o idenparts.o + +# Binaries + +$(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) + $(FC) -o $(PROG) $(PROCESS) $(MATRIX) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) + +$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat + cd ../../Source/MODEL; make + +$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat + cd ../../Source; make + +$(LIBDIR)libpdf.$(libext): + cd ../../Source/PDF; make + +$(LIBDIR)libgammaUPC.$(libext): + cd ../../Source/PDF/gammaUPC; make + +# Add source so that the compiler finds the DiscreteSampler module. 
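# [Editor's sketch, not part of the generated patch] Example expansion of the
# pattern rule below for one hypothetical matrix-element source file, assuming
# FC=gfortran and only the '-w' flag added to FFLAGS above:
#   gfortran -w -c matrix1_optim.f -I../../Source/ -I../../Source/PDF/gammaUPC
# (plus whatever $(MATRIX_FLAG) is set to, when the file matches $(MATRIX)).
# The -I../../Source/ include path is what lets the compiler find the
# DiscreteSampler module mentioned in the comment above.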
+$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +# Dependencies + +driver.f: genps.inc +symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc +genps.o: genps.inc nexternal.inc configs.inc +dummy_fct.0: run.inc genps.inc +cuts.o: genps.inc nexternal.inc pmass.inc +setcuts.o: genps.inc run_config.inc +invarients.o: genps.inc nexternal.inc +myamp.o: props.inc genps.inc nexternal.inc +reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ + run_config.inc +cluster.o: cluster.inc genps.inc nexternal.inc message.inc +addmothers.o: genps.inc nexternal.inc symswap.inc message.inc +unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ + run_config.inc +initcluster.o: message.inc + +clean: + $(RM) *.o gensym madevent madevent_forhel diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/myamp.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min with dsqrt_shatmax**2 with the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/reweight.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/reweight.f index 0a0bafa7c1..9d8fe1c4f0 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/reweight.f @@ -976,9 +976,9 @@ logical function setclscales(p, keepq2bck, ivec) $ ' and jcentral is ',jcentral(1),jcentral(2) if (btest(mlevel,3)) then - write(*,'(a$)') 'QCD jets (final): ' + write(*,'(a,$)') 'QCD jets (final): ' do i=3,nexternal - if(iqjets(i).gt.0) write(*,'(i3$)') i + if(iqjets(i).gt.0) write(*,'(i3,$)') i enddo write(*,*) endif @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) @@ -1387,7 +1387,9 @@ double precision function rewgt(p, ivec) integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - include 'configs.inc' + integer fake_id + common/to_sprop/sprop,tprid,fake_id +c include 'configs.inc' real*8 xptj,xptb,xpta,xptl,xmtc real*8 xetamin,xqcut,deltaeta common /to_specxpt/xptj,xptb,xpta,xptl,xmtc,xetamin,xqcut,deltaeta @@ -1588,6 +1590,8 @@ double precision function rewgt(p, ivec) $ ipdgcl(1,igraphs(1),iproc),ipart,.false.).and. $ (goodjet(idacl(n,1)).or.goodjet(idacl(n,2)))) then c alpha_s weight + + if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1600,6 +1604,7 @@ double precision function rewgt(p, ivec) write(*,*)' as: ',alphas(alpsfact*dsqrt(q2now)), & '/',asref,' -> ',alphas(alpsfact*dsqrt(q2now))/asref write(*,*)' and G=',SQRT(4d0*PI*ALPHAS(scale)) + endif endif endif endif diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/symmetry.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/symmetry.f index 309540a0a2..d0706e90b4 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/symmetry.f @@ -51,6 +51,7 @@ program symmetry integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) + integer fake_id include 'configs.inc' data use_config/0,lmaxconfigs*0/ @@ -232,7 +233,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +243,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +261,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/unwgt.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/gg_ttggg.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/gg_ttggg.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py index 42d82818d0..2bc6174b85 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3306,7 +3353,7 @@ def edit_dummy_fct_from_file(self, filelist, outdir): def retro_compatible_custom_fct(lines, mode=None): f77_type = ['real*8', 'integer', 'double precision', 'logical'] - function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ + function_pat = re.compile(r'^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ % {'type':'|'.join(f77_type)}, re.I+re.M) include_pat = re.compile(r"\s+include\s+[\'\"]([\w\./]*)") @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - 
logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/check_param_card.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py index 9ff7390cf5..8de498fcc2 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. 
Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. (beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. 
Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" @@ -6868,8 +6890,9 @@ def handle_alarm(signum, frame): else: log_level=20 - - if run_card: + if run_card and (run_card['lpp1'] !=0 or run_card['lpp2'] !=0): + # They are likely case like lpp=+-3, where alpas not need reset + # but those have dedicated name of pdf avoid the reset as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, 'cteq6_l': 0.118, diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/file_writers.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/files.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/files.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - 
logger.warning(why) + logger.warning("fail to cp", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "

To save bandwidth not all diagrams were converted to PNG."; print PAGE "

To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 'nprocs' in 
opts: + self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/histograms.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/histograms.py index 51ae2914fc..0883cd9613 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the variation. 
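As a reading aid for the gen_ximprove_gridpack.get_job_for_event changes above: the sketch below is a standalone, simplified Python version of the splitting arithmetic (class attributes turned into plain arguments; the rescaling of nevents by the previous iteration's nevents/nunwgt ratio is omitted). It is an illustration of the bookkeeping, not the code that actually runs.

def split_channel(needed_event, max_request_event, max_splitting,
                  min_event_in_iter, max_event_in_iter, split_channels=True):
    """Simplified sketch of the sub-job splitting used for gridpack refinement."""
    # one sub-job per block of max_request_event events, at least one job
    nb_split = int(max(1, (needed_event - 1) // max_request_event + 1))
    if not split_channels:
        nb_split = 1
    nb_split = max(1, min(nb_split, max_splitting))
    # crude estimate of phase-space points per iteration for each sub-job
    nevents = needed_event / nb_split
    if nevents < min_event_in_iter:
        # too few points per job: merge some sub-jobs back together
        nb_split = int(nb_split * nevents / min_event_in_iter) + 1
        nevents = min_event_in_iter
    nevents = max(min_event_in_iter, min(max_event_in_iter, nevents))
    return nb_split, int(nevents)

# e.g. 100000 requested events with at most 2500 per sub-job -> 40 jobs of 2500 points
assert split_channel(100000, 2500, 1000, 500, 4000) == (40, 2500)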
+ if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2405,10 +2404,10 @@ def output(self, path, format='gnuplot',number_of_ratios = -1, gnuplot_output_list=gnuplot_output_list_v5 else: output, _ = p.communicate() - output.decode(errors='ignore') + output = output.decode(errors='ignore') if not output: gnuplot_output_list=gnuplot_output_list_v5 - elif float(output.split()[1]) < 5. : + elif int(output.split()[1].split('.')[0]) < 5 : gnuplot_output_list=gnuplot_output_list_v4 else: gnuplot_output_list=gnuplot_output_list_v5 @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. 
Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. 
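A note on a pattern that recurs in this patch (extended_cmd.py and file_writers.py above, write_param_card.py and check_survey further down): bare exec()/eval() calls are replaced by calls that pass an explicit globals/locals pair, so the names created by the executed string land in a dictionary the caller controls. In Python 3, exec inside a function cannot reliably create new local variables, which is presumably the motivation. A minimal standalone illustration (the parameter name below is only an example):

import cmath

# names assigned by the executed string land in `context`, not in our local scope
context = {'cmath': cmath}
exec("mdl_sqrt2 = cmath.sqrt(2).real", globals(), context)

# later evaluations can resolve those names by passing the same dictionary
value = eval("2 * mdl_sqrt2", globals(), context)
assert abs(value - 2 ** 1.5) < 1e-12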
import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/lhe_parser.py index f6e47956cd..d4b94bab10 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -1792,7 +1794,10 @@ def add_decays(self, pdg_to_decay): if particle.pdg in pdg_to_decay and pdg_to_decay[particle.pdg]: one_decay = pdg_to_decay[particle.pdg].pop() self.add_decay_to_particle(i, one_decay) + particle.helicity = 9 return 
self.add_decays(pdg_to_decay) + + return self @@ -2166,10 +2171,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2961,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..dea35930ea 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz 
%(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3667,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_comine_iteration(self, line): + def do_combine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
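For the lhe_parser.py pseudorapidity fix a little further up: with norm = sqrt(px**2 + py**2 + pz**2), the definition is eta = 0.5*log((norm + pz)/(norm - pz)) = artanh(pz/norm); the pre-fix code had numerator and denominator swapped, which only flips the sign of eta. A quick standalone check of the corrected formula:

import math

def pseudorapidity(px, py, pz):
    # eta = 0.5 * ln((|p| + pz) / (|p| - pz)) = artanh(pz / |p|)
    norm = math.sqrt(px**2 + py**2 + pz**2)
    return 0.5 * math.log((norm + pz) / (norm - pz))

px, py, pz = 1.0, 2.0, 3.0
norm = math.sqrt(px**2 + py**2 + pz**2)
assert abs(pseudorapidity(px, py, pz) - math.atanh(pz / norm)) < 1e-12
assert pseudorapidity(px, py, pz) > 0  # forward-going (pz > 0) momenta get positive eta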
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
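The do_pythia8 changes above make Pythia8's bundled main164 example the default driver and fall back to the MG5aMC_PY8_interface route ('--old_interface') when it cannot be found. The lookup is an ordered path probe; a hedged sketch of just that step, with the two candidate locations taken from the hunk above (find_main164 is a hypothetical helper, not part of the patch):

import os

def find_main164(pythia8_path):
    """Return the first existing main164 executable, or None to trigger the fallback."""
    candidates = [
        os.path.join(pythia8_path, 'share', 'Pythia8', 'examples', 'main164'),
        os.path.join(pythia8_path, 'examples', 'main164'),
    ]
    for path in candidates:
        if os.path.exists(path):
            return path
    return None  # caller then retries with ' --old_interface'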
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
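Each split above receives its own Main:numberOfEvents and has HEPMCoutput:scaling multiplied by its number of events; partition_for_PY8 itself is computed elsewhere and is not shown in this hunk. Purely as an illustration of the bookkeeping, a hypothetical even partition (chunk sizes differing by at most one) could look like:

def partition_events(total, nb_splits):
    # hypothetical helper, not the one MadEvent uses: spread `total` events
    # over nb_splits chunks whose sizes differ by at most one
    base, rest = divmod(total, nb_splits)
    return [base + (1 if i < rest else 0) for i in range(nb_splits)]

parts = partition_events(10007, 8)
assert sum(parts) == 10007 and max(parts) - min(parts) <= 1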
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -5596,6 +5635,19 @@ def do_plot(self, line): else: logger.info('No valid files for delphes plot') + def do_compile(self, line): + """compile the current directory """ + + args = self.split_arg(line) + self.ask_run_configuration(mode='parton') + self.run_card = banner_mod.RunCard(pjoin(self.me_dir, 'Cards', 'run_card.dat')) + self.configure_directory(html_opening =False) + + for Pdir in self.get_Pdir(): + misc.sprint(Pdir) + self.compile(['gensym'], cwd=Pdir) + self.compile(['madevent_forhel'], cwd=Pdir) + ############################################################################ def do_syscalc(self, line): """Evaluate systematics variation weights for a given run""" @@ -6132,7 +6184,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in 
Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file.' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6896,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6906,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7023,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... 
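The remove_empty_events helper added above treats an events.lhe below 10 bytes as empty and scans the channel log backwards for known markers to record why. The sketch below mirrors only that classification step (marker strings copied from the hunk; misc.BackRead, the 'Deleting file events.lhe' anchor and the not-found-after-N-lines heuristic are left out), so it is an approximation rather than the shipped logic:

import os
from collections import defaultdict

MARKERS = {  # marker substring -> reason label, as in remove_empty_events above
    'Impossible BW configuration': 'bwconfig',
    'Loosen cuts or increase max_events': 'cuts',
    'all returned zero': 'zero',
}

def classify_empty_channels(gdirs, logname='log.txt'):
    """Return {reason: [Gdir, ...]} for channels whose events.lhe is (nearly) empty."""
    reasons = defaultdict(list)
    for gdir in gdirs:
        try:
            size = os.path.getsize(os.path.join(gdir, 'events.lhe'))
        except OSError:
            size = 0
        if size >= 10:
            continue  # channel produced events, keep it
        label = 'unknown'
        try:
            with open(os.path.join(gdir, logname)) as log:
                tail = log.readlines()[-200:]  # cheap stand-in for reading backwards
        except OSError:
            tail = []
        for line in reversed(tail):
            hits = [reason for marker, reason in MARKERS.items() if marker in line]
            if hits:
                label = hits[0]
                break
        reasons[label].append(gdir)
    return reasons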
self.exec_cmd('combine_events') @@ -6902,6 +7051,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7066,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7078,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7203,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7223,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/restore_data b/epochX/cudacpp/gg_ttggg.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/restore_data +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . 
-mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/sum_html.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s

' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/madevent b/epochX/cudacpp/gg_ttggg.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/madevent +++ b/epochX/cudacpp/gg_ttggg.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h index 53dd560ed6..da11e740d9 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc index 47a3a011b8..a5e188e4f8 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h index 76066c7bb1..24e0e80f84 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h index 7c6a082392..be5c5a6357 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index 37d3314a5d..4feba239bd 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +57,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005965471267700195  +DEBUG: model prefixing takes 0.004603147506713867  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,33 +150,33 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.863 s +1 processes with 1240 diagrams generated in 1.800 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. 
-Generated helas calls for 1 subprocesses (1240 diagrams) in 6.535 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. +Generated helas calls for 1 subprocesses (1240 diagrams) in 5.597 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.348 s +ALOHA: aloha creates 5 routines in 0.328 s VVV1 VVV1 FFV1 @@ -190,17 +189,17 @@ ALOHA: aloha creates 5 routines in 0.348 s VVVV3 VVVV4 VVVV4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. quit -real 0m12.948s -user 0m12.781s -sys 0m0.107s -Code generation completed in 13 seconds +real 0m11.011s +user 0m10.838s +sys 0m0.129s +Code generation completed in 11 seconds diff --git a/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT b/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). 
+ */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
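A minimal usage sketch, not part of the patch, for the checkGpuBlas/assertGpuBlas helper added to GpuRuntime.h just above, combined with the gpuBlas* aliases from the GpuAbstraction.h hunks: it wraps cuBLAS/hipBLAS calls and asserts if a call does not return GPUBLAS_STATUS_SUCCESS. It assumes a GPU build with BLAS enabled; the function name is illustrative.

// Illustrative only: create and destroy a BLAS handle through the gpuBlas* aliases.
#include "GpuAbstraction.h"
#include "GpuRuntime.h"

#if defined( MGONGPUCPP_GPUIMPL ) && !defined( MGONGPU_HAS_NO_BLAS )
inline void exampleBlasHandleLifetime()
{
  gpuBlasHandle_t handle;                   // cublasHandle_t (CUDA) or hipblasHandle_t (HIP)
  checkGpuBlas( gpuBlasCreate( &handle ) ); // assert on any non-SUCCESS status
  checkGpuBlas( gpuBlasDestroy( handle ) );
}
#endif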
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
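The constructor and computeGoodHelicities changes above introduce a runtime switch for the BLAS color sum (via the CUDACPP_RUNTIME_BLASCOLORSUM and CUDACPP_RUNTIME_CUBLASTF32TENSOR environment variables), one GPU stream per good helicity, and a single cuBLAS handle shared by all helicities. The following standalone sketch illustrates that setup pattern only; it uses raw CUDA/cuBLAS calls instead of the plugin's gpuXxx/checkGpuBlas wrappers, and the struct and function names are hypothetical.

// Sketch only (not the plugin code): one-time env-var switches plus
// per-helicity stream and cuBLAS handle creation, using raw CUDA/cuBLAS calls.
#include "cublas_v2.h"
#include <cuda_runtime.h>
#include <cstdlib>
#include <vector>

struct BlasColorSumSetup // hypothetical helper, for illustration only
{
  bool useBlas = false;              // from CUDACPP_RUNTIME_BLASCOLORSUM (set and non-empty)
  bool useTf32 = false;              // from CUDACPP_RUNTIME_CUBLASTF32TENSOR (set and non-empty)
  cublasHandle_t handle = nullptr;   // one handle for all good helicities
  std::vector<cudaStream_t> streams; // one stream per good helicity
};

inline BlasColorSumSetup makeSetup( int nGoodHel )
{
  BlasColorSumSetup s;
  const char* e1 = std::getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" );
  s.useBlas = ( e1 && *e1 != '\0' );
  const char* e2 = std::getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" );
  s.useTf32 = s.useBlas && ( e2 && *e2 != '\0' );
  s.streams.resize( nGoodHel );
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
    cudaStreamCreate( &s.streams[ighel] ); // error checking omitted for brevity
  if( s.useBlas )
  {
    cublasCreate( &s.handle );
    if( s.useTf32 ) cublasSetMathMode( s.handle, CUBLAS_TF32_TENSOR_OP_MATH );
  }
  return s;
}

Teardown would mirror the destructor shown above: cublasDestroy on the handle (if created) and cudaStreamDestroy on each non-null stream.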
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h index 65a101888d..2fa0ce29e0 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ 
namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer 
DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc index 07099839d3..763cfce31f 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 120; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities -#endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId 
= 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) +#endif + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using 
E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
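The DeviceAccessJamp2 helper introduced above encodes the SoA layout used for the jamp2 buffer: the color index is the slow dimension and the event index (one GPU thread per event) is the fast one, i.e. buffer[icol * nevt + ievt]. A minimal sketch of this access pattern follows; the kernel name and the assumption of separate real/imaginary input arrays are hypothetical and only meant to illustrate the indexing, not the plugin's calculate_jamps.

// Sketch only: SoA indexing buffer[icol * nevt + ievt], one thread per event.
// "accumulateJamp2" is a hypothetical kernel, not part of the plugin.
__global__ void accumulateJamp2( const double* jampRe, // [ncolor * nevt] real parts
                                 const double* jampIm, // [ncolor * nevt] imaginary parts
                                 double* jamp2,        // [ncolor * nevt] running sum over helicities
                                 int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;             // assumes the launch grid covers exactly nevt events
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    const double re = jampRe[icol * nevt + ievt];
    const double im = jampIm[icol * nevt + ievt];
    jamp2[icol * nevt + ievt] += re * re + im * im;    // |jamp|^2, as in cxabs2
  }
}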
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -31856,272 +31912,43 @@ namespace mg5amcCpu jamp_sv[116] -= amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttxggg()?) 
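The hunk below removes the hardcoded color matrix cf[ncolor][ncolor] and the denominators denom[ncolor] from calculate_wavefunctions, since the color sum is now performed in a separate function/kernel (see the color_sum.h include added earlier). For reference, the quadratic form those arrays implement is |M|^2 += Re( sum_i conj(jamp_i) * sum_j cf_ij * jamp_j ) / denom_i. The host-side sketch below spells out that computation for one event and one helicity; it assumes a row-major cf array and std::complex amplitudes, and it is not the plugin's color_sum.h, which can reformulate the same sum as cuBLAS/hipBLAS matrix products over all events of a helicity stream.

// Reference sketch of the color sum quadratic form (one event, one helicity).
#include <complex>
double colorSum( const std::complex<double>* jamp, // [ncolor] QCD partial amplitudes
                 const double* cf,                 // [ncolor*ncolor] color matrix, row-major
                 const double* denom,              // [ncolor] color denominators
                 int ncolor )
{
  double me2 = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    std::complex<double> ztemp = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ )
      ztemp += cf[icol * ncolor + jcol] * jamp[jcol];
    me2 += ( ztemp * std::conj( jamp[icol] ) ).real() / denom[icol];
  }
  return me2; // contribution of this helicity to |M|^2
}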
- - // The color denominators (initialize all array elements, with ncolor=120) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324 }; // 1-D array[120] - - // The color matrix (initialize all array elements, with ncolor=120) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136 }, - { -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116 }, - { -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116 }, - { 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44 }, - { 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, 
-134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44 }, - { 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514 }, - { -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116 }, - { 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442 }, - { 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44 }, - { -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28 }, - { -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 
442, -116, 28, -44, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53 }, - { -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62 }, - { 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44 }, - { -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53 }, - { 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514 }, - { -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62 }, - { 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100 }, - { 496, -224, -80, -8, 
496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10 }, - { -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28 }, - { -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62 }, - { -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62 }, - { 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10 }, - { 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10 }, - { -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, 
-62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1 }, - { -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116 }, - { 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442 }, - { 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442 }, - { -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134 }, - { -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134 }, - { -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 10, 
-80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505 }, - { 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44 }, - { -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134 }, - { -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28 }, - { 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224 }, - { 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62 }, - { 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 
1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496 }, - { -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53 }, - { 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19 }, - { -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62 }, - { 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496 }, - { -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10 }, - { -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80 }, - { 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, 
-80, -80, 10, 10, 100, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62 }, - { 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71 }, - { 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10 }, - { -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80 }, - { -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1 }, - { 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8 }, - { 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 4096, -512, -512, 64, 64, 640, 
-512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44 }, - { -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134 }, - { -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53 }, - { 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62 }, - { 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19 }, - { 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71 }, - { 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 
1, -62, -71, -116, 442, 442, -134, -134, 505, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514 }, - { -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505 }, - { -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62 }, - { 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496 }, - { 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71 }, - { 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568 }, - { 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 
100 }, - { -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10 }, - { 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10 }, - { -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80 }, - { 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80 }, - { 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640 }, - { -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10 }, - { 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, -53, -62, 442, -116, 28, -44, -62, 
10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1 }, - { -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1 }, - { -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8 }, - { -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8 }, - { -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64 }, - { -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28 }, - { 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, 
-44, 136, -116, 514, -44, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62 }, - { -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62 }, - { 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10 }, - { -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10 }, - { -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1 }, - { -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62 }, - { 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, 28, -224, -44, 
-134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71 }, - { 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10 }, - { -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80 }, - { 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1 }, - { -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8 }, - { 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10 }, - { -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1 }, - { -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, 
-53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1 }, - { 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8 }, - { -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8 }, - { -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64 }, - { 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80 }, - { -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8 }, - { -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 
100, 10, 640, -80, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8 }, - { 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64 }, - { 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64 }, - { -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512 }, - { 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224 }, - { 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496 }, - { 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, -44, -53, 514, -62, 100, 10, -116, -44, 
136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496 }, - { -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80 }, - { -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80 }, - { 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8 }, - { 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496 }, - { 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568 }, - { -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, 
-8, 496, -224, -80, -8, 496, -80 }, - { 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640 }, - { -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8 }, - { -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64 }, - { -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80 }, - { -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8 }, - { 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8 }, - { -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 
-53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64 }, - { 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64 }, - { -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512 }, - { 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640 }, - { -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64 }, - { -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64 }, - { -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, 
-224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512 }, - { -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512 }, - { 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096 } }; // 2-D array[120][120] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! 
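  // A minimal standalone sketch of the accumulation pattern used just below (NCOLSKETCH and addJamp2Sketch
  // are illustrative names only, not the plugin's DeviceAccessJamp2 API): when each good helicity runs in its
  // own CUDA stream, two kernels may update the same per-event, per-color |jamp|^2 slot at the same time, so
  // the read-modify-write must be atomic (note: atomicAdd on double requires compute capability 6.0 or later).
  //
  //   #include <cuda_runtime.h>
  //   #include <cstdio>
  //   #define NCOLSKETCH 2 // hypothetical number of color flows
  //
  //   __global__ void addJamp2Sketch( double* jamp2, const double* jampRe, const double* jampIm, int nevt )
  //   {
  //     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per thread
  //     if( ievt >= nevt ) return;
  //     for( int icol = 0; icol < NCOLSKETCH; icol++ )
  //     {
  //       const double re = jampRe[icol * nevt + ievt];
  //       const double im = jampIm[icol * nevt + ievt];
  //       atomicAdd( &jamp2[icol * nevt + ievt], re * re + im * im ); // safe under concurrent helicity streams
  //     }
  //   }
  //
  //   int main()
  //   {
  //     const int nevt = 32;
  //     double *jamp2, *re, *im;
  //     cudaMallocManaged( &jamp2, NCOLSKETCH * nevt * sizeof( double ) );
  //     cudaMallocManaged( &re, NCOLSKETCH * nevt * sizeof( double ) );
  //     cudaMallocManaged( &im, NCOLSKETCH * nevt * sizeof( double ) );
  //     for( int i = 0; i < NCOLSKETCH * nevt; i++ ) { jamp2[i] = 0.; re[i] = 1.; im[i] = 2.; }
  //     cudaStream_t s0, s1;
  //     cudaStreamCreate( &s0 );
  //     cudaStreamCreate( &s1 );
  //     addJamp2Sketch<<<1, nevt, 0, s0>>>( jamp2, re, im, nevt ); // "helicity 0"
  //     addJamp2Sketch<<<1, nevt, 0, s1>>>( jamp2, re, im, nevt ); // "helicity 1"
  //     cudaDeviceSynchronize();
  //     std::printf( "jamp2[0] = %f (expect 10 = 2 helicities x (1*1 + 2*2))\n", jamp2[0] );
  //     cudaStreamDestroy( s0 );
  //     cudaStreamDestroy( s1 );
  //     cudaFree( jamp2 );
  //     cudaFree( re );
  //     cudaFree( im );
  //     return 0;
  //   }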
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -32273,7 +32100,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } 
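  // As a minimal, hedged illustration of the change above (fpeEnable() is now only called in MGONGPUCPP_DEBUG
  // builds): on Linux/glibc, where the feenableexcept extension is available, enabling SIGFPE traps typically
  // looks like the sketch below; the plugin's actual fpeEnable implementation may differ, and fpeEnableSketch
  // is an illustrative name only.
  //
  //   #include <fenv.h>  // feenableexcept (glibc extension; g++ defines _GNU_SOURCE by default)
  //   #include <cstdio>
  //
  //   void fpeEnableSketch()
  //   {
  //     // Convert invalid operations, divisions by zero and overflows into SIGFPE signals,
  //     // so that NaNs and infinities abort a debug run early instead of silently propagating
  //     feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW );
  //   }
  //
  //   int main()
  //   {
  //   #ifdef MGONGPUCPP_DEBUG // same guard as in the constructor above
  //     fpeEnableSketch();
  //     std::printf( "SIGFPE traps enabled (debug build)\n" );
  //   #endif
  //     return 0;
  //   }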
//-------------------------------------------------------------------------- @@ -32309,6 +32140,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -32352,6 +32187,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -32472,8 +32311,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -32481,25 +32320,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); 
nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -32644,13 +32661,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 1536 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -32662,18 +32673,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -32698,93 +32714,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -32826,7 +32779,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -32849,7 +32802,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -32858,21 +32811,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -32886,8 +32841,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -32903,11 +32860,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -33009,14 +32967,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h index 2eb1e066ff..f20243637a 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 128; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 1240; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 120; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, 
running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/color_sum.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/color_sum.cc new file mode 100644 index 0000000000..dea7f9fdb2 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/color_sum.cc @@ -0,0 +1,545 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
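// ---------------------------------------------------------------------------
// [Illustration only - not part of the generated code] The new GPU sigmaKin in
// the diff above launches one calculate_jamps kernel per good helicity, each on
// its own CUDA/HIP stream (gpuLaunchKernelStream), and synchronizes once
// (gpuDeviceSynchronize) before the helicity/colour selection kernels. A minimal
// standalone CUDA sketch of that "one good helicity per stream" pattern follows;
// the kernel name, function name and buffer layout are hypothetical stand-ins,
// assuming one event per GPU thread as in the plugin.

#include <cuda_runtime.h>
#include <vector>

__global__ void computeOneHelicity( double* out, int ihel, int nevt ) // stand-in for calculate_jamps
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
  if( ievt < nevt ) out[ievt] = static_cast<double>( ihel ); // placeholder for the real per-helicity work
}

void runGoodHelicities( double* devSuperBuffer, const std::vector<int>& goodHel, int gpublocks, int gputhreads )
{
  const int nevt = gpublocks * gputhreads;
  std::vector<cudaStream_t> streams( goodHel.size() );
  for( int ighel = 0; ighel < (int)goodHel.size(); ighel++ )
  {
    cudaStreamCreate( &streams[ighel] );
    double* slice = devSuperBuffer + ighel * nevt; // each helicity writes its own slice of the super-buffer
    computeOneHelicity<<<gpublocks, gputhreads, 0, streams[ighel]>>>( slice, goodHel[ighel], nevt );
  }
  cudaDeviceSynchronize(); // wait for all helicity streams before selecting one helicity/colour per event
  for( auto& s : streams ) cudaStreamDestroy( s );
}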
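// ---------------------------------------------------------------------------
// [Illustration only - not part of the generated code] The two constant arrays
// defined below (colorDenom and colorMatrix, with ncolor=120 for this process)
// encode the colour factor matrix CF of the process, stored as integer
// numerators over a per-row denominator: CF[i][j] = colorMatrix[i][j] / colorDenom[i].
// The colour-summed contribution to |M|^2 for one helicity (before the final
// spin/colour averaging by helcolDenominators in sigmaKin) is
//   Re( sum_i ( sum_j colorMatrix[i][j] * jamp[j] ) * conj( jamp[i] ) / colorDenom[i] ).
// A minimal standalone sketch of this reduction follows, using plain
// std::complex<double> instead of the plugin's vectorized types; the function
// name is hypothetical.

#include <complex>

template<int N>
double colorSumExample( const std::complex<double> jamp[N], // QCD partial amplitudes for one event and one helicity
                        const double cfNum[N][N],           // colour matrix numerators (colorMatrix)
                        const double cfDen[N] )             // colour matrix denominators (colorDenom)
{
  double me2 = 0;
  for( int icol = 0; icol < N; icol++ )
  {
    std::complex<double> ztemp = 0;
    for( int jcol = 0; jcol < N; jcol++ )
      ztemp += cfNum[icol][jcol] * jamp[jcol];                          // row icol of CF times the jamp vector
    me2 += ( ztemp * std::conj( jamp[icol] ) ).real() / cfDen[icol];    // add Re( (CF.jamp)_i * jamp_i^* ) / den_i
  }
  return me2; // colour-summed |M|^2 for this helicity
}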
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=120) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324 }; // 1-D array[120] + + // The color matrix (initialize all array elements, with ncolor=120) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136 }, + { -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116 }, + { -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116 }, + { 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 1, -8, 
-71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44 }, + { 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44 }, + { 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514 }, + { -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116 }, + { 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442 }, + { 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44 }, + { -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28 }, + { -8, 64, 568, 496, 640, -80, 64, -512, 
640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53 }, + { -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62 }, + { 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44 }, + { -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53 }, + { 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514 }, + { -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62 }, + { 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 
496, -62, -62, 496, -134, -44, -224, 28, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100 }, + { 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10 }, + { -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28 }, + { -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62 }, + { -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62 }, + { 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10 }, + { 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, 
1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10 }, + { -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1 }, + { -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116 }, + { 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442 }, + { 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442 }, + { -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134 }, + { -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, 
-62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134 }, + { -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505 }, + { 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44 }, + { -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134 }, + { -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28 }, + { 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224 }, + { 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62 }, + { 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 
28, -44, 442, -116, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496 }, + { -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53 }, + { 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19 }, + { -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62 }, + { 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496 }, + { -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10 }, + { -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, 505, -134, -134, 442, 
442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80 }, + { 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62 }, + { 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71 }, + { 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10 }, + { -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80 }, + { -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1 }, + { 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, 
-71, 568, -62, 496, 10, 100, -80, 640, 10, -80, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8 }, + { 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44 }, + { -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134 }, + { -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53 }, + { 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62 }, + { 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19 }, + { 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71 }, + { 640, 
-80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514 }, + { -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505 }, + { -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62 }, + { 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496 }, + { 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71 }, + { 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568 }, + { 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, 
-71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100 }, + { -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10 }, + { 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10 }, + { -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80 }, + { 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80 }, + { 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640 }, + { -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, 
-512, -512, 64, 64, 640, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10 }, + { 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1 }, + { -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1 }, + { -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8 }, + { -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8 }, + { -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64 }, + { -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 514, -62, -44, -53, 10, 100, -44, 
28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28 }, + { 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62 }, + { -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62 }, + { 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10 }, + { -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10 }, + { -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1 }, + { -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62 }, + { 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 
10, -8, 1, -62, 496, 514, 505, 568, -71, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71 }, + { 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10 }, + { -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80 }, + { 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1 }, + { -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8 }, + { 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10 }, + { -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, 
1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1 }, + { -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1 }, + { 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8 }, + { -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8 }, + { -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64 }, + { 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80 }, + { -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, -80, -8, -8, 64, 64, -512, 496, -80, 
-224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8 }, + { -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8 }, + { 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64 }, + { 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64 }, + { -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512 }, + { 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224 }, + { 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 
640, -80, 568, 496 }, + { 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496 }, + { -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80 }, + { -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80 }, + { 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8 }, + { 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496 }, + { 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568 }, + { -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 10, -80, -62, 
-71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80 }, + { 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640 }, + { -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8 }, + { -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64 }, + { -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80 }, + { -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8 }, + { 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 
10, 10, 1, 64, -8, -512, 64, -80, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8 }, + { -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64 }, + { 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64 }, + { -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512 }, + { 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640 }, + { -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64 }, + { -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -80, 
640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64 }, + { -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512 }, + { -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512 }, + { 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096 } }; // 2-D array[120][120] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( 
int icol = 0; icol < ncolor; icol++ )
+ {
+ // Diagonal terms
+ value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol];
+ // Off-diagonal terms
+ for( int jcol = icol + 1; jcol < ncolor; jcol++ )
+ value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol];
+ }
+ }
+ fptype2 value[ncolor][ncolor];
+ };
+ static constexpr auto cf2 = TriangularNormalizedColorMatrix();
+ // Use the property that M is a real matrix (see #475):
+ // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB
+ // In addition, on C++ use the property that M is symmetric (see #475),
+ // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time:
+ // we gain (though not a factor 2...) in speed here as we only loop over the upper triangular part of the matrix.
+ // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
+ fptype_sv deltaMEs = { 0 };
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ fptype_sv deltaMEs_next = { 0 };
+ // Mixed mode: merge two neppV vectors into one neppV2 vector
+ fptype2_sv jampR_sv[ncolor];
+ fptype2_sv jampI_sv[ncolor];
+ for( int icol = 0; icol < ncolor; icol++ )
+ {
+ jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) );
+ jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) );
+ }
+#else
+ const cxtype_sv* jamp_sv = allJamp_sv;
+#endif
+ // Loop over icol
+ for( int icol = 0; icol < ncolor; icol++ )
+ {
+ // Diagonal terms
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ fptype2_sv& jampRi_sv = jampR_sv[icol];
+ fptype2_sv& jampIi_sv = jampI_sv[icol];
+#else
+ fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) );
+ fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) );
+#endif
+ fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv;
+ fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv;
+ // Loop over jcol
+ for( int jcol = icol + 1; jcol < ncolor; jcol++ )
+ {
+ // Off-diagonal terms
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ fptype2_sv& jampRj_sv = jampR_sv[jcol];
+ fptype2_sv& jampIj_sv = jampI_sv[jcol];
+#else
+ fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) );
+ fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) );
+#endif
+ ztempR_sv += cf2.value[icol][jcol] * jampRj_sv;
+ ztempI_sv += cf2.value[icol][jcol] * jampIj_sv;
+ }
+ fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ deltaMEs += fpvsplit0( deltaMEs2 );
+ deltaMEs_next += fpvsplit1( deltaMEs2 );
+#else
+ deltaMEs += deltaMEs2;
+#endif
+ }
+ // *** STORE THE RESULTS ***
+ using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events
+ fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 );
+ // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
+ fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
+ MEs_sv += deltaMEs; // fix #435
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV );
+ fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next );
+ MEs_sv_next += deltaMEs_next;
+#endif
+ }
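As an illustration of the optimization in color_sum_cpu above (not part of the patch): because the normalized color matrix M is real and symmetric, the quadratic form (A-iB)(M)(A+iB) = AMA + iAMB - iBMA + BMB collapses to AMA + BMB, and the double loop can be folded into a diagonal term plus doubled upper-triangular terms. The following minimal, self-contained C++ sketch checks that equivalence; the 2x2 matrix, denominators and amplitudes are made-up numbers standing in for colorMatrix/colorDenom (the generated process uses the 120x120 arrays above), so this is only a hedged sketch, not plugin code.

```cpp
// Standalone check (illustration only): full quadratic form vs triangular color sum
#include <complex>
#include <cstdio>

int main()
{
  constexpr int ncolor = 2;
  // Hypothetical stand-ins for colorDenom and colorMatrix (real, symmetric)
  constexpr double denom[ncolor] = { 3., 3. };
  constexpr double cf[ncolor][ncolor] = { { 16., -2. }, { -2., 16. } };
  // Hypothetical color amplitudes (the "jamps") for one event and one helicity
  const std::complex<double> jamp[ncolor] = { { 1.5, -0.5 }, { 0.25, 2.0 } };
  // Full quadratic form conj(J)_i * ( cf[i][j] / denom[i] ) * J_j (imaginary part cancels)
  double me2full = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      me2full += ( std::conj( jamp[i] ) * jamp[j] ).real() * cf[i][j] / denom[i];
  // Triangular form: diagonal term once, off-diagonal terms doubled (uses cf[i][j] == cf[j][i])
  double me2tri = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztR = cf[i][i] / denom[i] * jamp[i].real();
    double ztI = cf[i][i] / denom[i] * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztR += 2 * cf[i][j] / denom[i] * jamp[j].real();
      ztI += 2 * cf[i][j] / denom[i] * jamp[j].imag();
    }
    me2tri += ztR * jamp[i].real() + ztI * jamp[i].imag();
  }
  std::printf( "full=%.12f triangular=%.12f\n", me2full, me2tri ); // the two values agree
  return 0;
}
```

Both loops print the same value, which is the |M|^2 contribution that the triangular loop in color_sum_cpu accumulates per event and per helicity.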
+#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/color_sum.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ 
b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/fbridge.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/makefile_original.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/color_sum.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], 
buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LINKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below.
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. 
+ +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h index 53dd560ed6..da11e740d9 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc index 47a3a011b8..a5e188e4f8 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h index 76066c7bb1..24e0e80f84 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h index d3c4ca5695..7d34de72f8 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose whether cuBLAS and hipBLAS are supported (e.g. for the BLAS-based colour sum) +// For both CUDA and HIP, by default, assume BLAS is available, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!]
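As an illustration of how client code can honour the MGONGPU_HAS_NO_BLAS switch introduced in mgOnGpuConfig.h above, here is a minimal C++ sketch (not part of this patch): it guards a standard cuBLAS v2 call and falls back to a plain host loop in noBLAS builds. The function name exampleBlasGemv and the host-side fallback are illustrative assumptions, not identifiers taken from this codebase.

#include "mgOnGpuConfig.h"
#ifndef MGONGPU_HAS_NO_BLAS
#include "cublas_v2.h"
#endif

// Compute y = A * x for an n x n column-major matrix in double precision
inline void exampleBlasGemv( const double* A, const double* x, double* y, int n )
{
#ifndef MGONGPU_HAS_NO_BLAS
  // BLAS build (CUDA with cuBLAS headers available): A, x, y are device pointers
  cublasHandle_t handle;
  cublasCreate( &handle );
  const double alpha = 1., beta = 0.;
  cublasDgemv( handle, CUBLAS_OP_N, n, n, &alpha, A, n, x, 1, &beta, y, 1 );
  cublasDestroy( handle );
#else
  // noBLAS build (e.g. a C++-only build, or -DMGONGPU_HAS_NO_BLAS set by hand): A, x, y are host pointers
  for( int i = 0; i < n; ++i )
  {
    y[i] = 0.;
    for( int j = 0; j < n; ++j ) y[i] += A[i + j * n] * x[j]; // column-major indexing, as in BLAS
  }
#endif
}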
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 1baee42e06..10d129eb59 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +56,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006198406219482422  +DEBUG: model prefixing takes 0.006429910659790039  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -166,21 +165,21 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.078 s +8 processes with 40 diagrams generated in 0.115 s Total: 8 processes with 40 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_gq_ttq INFO: remove old information in CODEGEN_mad_gq_ttq -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 INFO: Processing color information for process: g u > t t~ u @1 @@ -200,9 +199,9 @@ FileWriter t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1577]  INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -211,62 +210,50 @@ FileWriter t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1552]  -Generated helas calls for 2 subprocesses (10 diagrams) in 0.030 s -Wrote files for 32 helas calls in 0.164 s +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1577]  +Generated helas calls for 2 subprocesses (10 diagrams) in 0.042 s +Wrote files for 32 helas calls in 0.167 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.148 s +ALOHA: aloha creates 2 routines in 0.135 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.132 s +ALOHA: aloha creates 4 routines in 0.100 s FFV1 FFV1 FFV1 FFV1 VVV1 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses/P1_gu_ttxu; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 74 (offset 3 lines). -Hunk #2 succeeded at 254 (offset 27 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses/P1_gux_ttxux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 74 (offset 3 lines). -Hunk #2 succeeded at 254 (offset 27 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. +DEBUG: result.returncode =  0 [output.py at line 273]  +Output to directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README +/home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README Run "open index.html" to see more information about this process. quit -real 0m2.210s -user 0m1.890s -sys 0m0.303s -Code generation completed in 2 seconds +real 0m3.071s +user 0m2.389s +sys 0m0.594s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * @@ -279,7 +266,7 @@ Code generation completed in 2 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -287,10 +274,9 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt -Using default text editor "vi". 
Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -309,7 +295,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -317,10 +303,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gq_ttq.mad/COPYRIGHT b/epochX/cudacpp/gq_ttq.mad/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/gq_ttq.mad/COPYRIGHT +++ b/epochX/cudacpp/gq_ttq.mad/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. 
The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt index 68b4c46295..311ceaa803 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat index 795e11afaf..7e99b87668 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.5 2025-10-17 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/run_card.dat b/epochX/cudacpp/gq_ttq.mad/Cards/run_card.dat index 66a805e521..3db737130c 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gq_ttq.mad/Cards/run_card.dat @@ -109,6 +109,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/run_card_default.dat b/epochX/cudacpp/gq_ttq.mad/Cards/run_card_default.dat index 8c0f1e2199..47c2051950 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gq_ttq.mad/Cards/run_card_default.dat @@ -109,6 +109,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gq_ttq.mad/MGMEVersion.txt b/epochX/cudacpp/gq_ttq.mad/MGMEVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/gq_ttq.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gq_ttq.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/Source/.make_opts b/epochX/cudacpp/gq_ttq.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/.make_opts +++ b/epochX/cudacpp/gq_ttq.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/gq_ttq.mad/Source/alfas_functions.f b/epochX/cudacpp/gq_ttq.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/gq_ttq.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/gq_ttq.mad/Source/cuts.inc b/epochX/cudacpp/gq_ttq.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/cuts.inc +++ b/epochX/cudacpp/gq_ttq.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/gq_ttq.mad/Source/make_opts b/epochX/cudacpp/gq_ttq.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/make_opts +++ b/epochX/cudacpp/gq_ttq.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/gq_ttq.mad/Source/makefile b/epochX/cudacpp/gq_ttq.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/makefile +++ b/epochX/cudacpp/gq_ttq.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/gq_ttq.mad/Source/run_card.inc b/epochX/cudacpp/gq_ttq.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/run_card.inc +++ b/epochX/cudacpp/gq_ttq.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. 
In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). + */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
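For reference, the grid-selection loop in the Bridge constructor above can be read as the following standalone sketch: starting from the default thread count, the thread count is halved until the grid covers nevt exactly, and the constructor gives up below s_gputhreadsmin. The function name pickGrid and the literal defaults 32/256 are illustrative only, not part of the plugin API.

// Standalone sketch (illustrative only) of the Bridge grid-selection loop above.
#include <stdexcept>
#include <string>
#include <utility>

std::pair<int, int> pickGrid( int nevt, int gputhreadsmin = 32, int gputhreads = 256 )
{
  if( nevt < gputhreadsmin || nevt % gputhreadsmin != 0 )
    throw std::runtime_error( "nevt should be a multiple of " + std::to_string( gputhreadsmin ) );
  int gpublocks = nevt / gputhreads; // initial guess with the default thread count
  while( nevt != gpublocks * gputhreads )
  {
    gputhreads /= 2; // halve the thread count until the grid covers nevt exactly
    if( gputhreads < gputhreadsmin )
      throw std::logic_error( "cannot choose gputhreads" ); // should never happen for a valid nevt
    gpublocks = nevt / gputhreads;
  }
  return { gpublocks, gputhreads }; // e.g. nevt=96 gives { 3, 32 }
}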
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
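Both transposition implementations above realize the same index mapping between the Fortran AOS layout momenta[ievt][ipar][ip4] and the cudacpp AOSOA layout momenta[ipagM][ipar][ip4][ieppM] with nevt = npagM*neppM. A minimal host-side sketch of the F2C direction, with an illustrative function name (npar, np4 and neppM correspond to CPPProcess::npar, CPPProcess::np4 and MemoryAccessMomenta::neppM):

// Standalone sketch of the AOS -> AOSOA index mapping used by the momenta transpositions above.
#include <cassert>

void aosToAosoa( const double* in, double* out, int nevt, int npar, int np4, int neppM )
{
  assert( nevt % neppM == 0 ); // the AOSOA layout requires nevt to be a multiple of neppM
  for( int ievt = 0; ievt < nevt; ievt++ )
  {
    const int ipagM = ievt / neppM; // "page" of neppM events
    const int ieppM = ievt % neppM; // event within the page
    for( int ipar = 0; ipar < npar; ipar++ )
      for( int ip4 = 0; ip4 < np4; ip4++ )
      {
        const int fpos = ievt * npar * np4 + ipar * np4 + ip4;                                  // AOS:   in[ievt][ipar][ip4]
        const int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; // AOSOA: out[ipagM][ipar][ip4][ieppM]
        out[cpos] = in[fpos]; // F2C (Fortran to C); the C2F direction simply swaps in and out
      }
  }
}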
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MGVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
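The checkGpuBlas/assertGpuBlas helper added to GpuRuntime.h above is meant to wrap the new gpuBlas* calls from GpuAbstraction.h, alongside the per-helicity GPU streams. A standalone sketch of that lifecycle on the CUDA path, spelled with plain CUDA/cuBLAS calls and an illustrative checkGpuBlasSketch macro standing in for checkGpuBlas (on the HIP path the same pattern goes through hipStreamCreate/hipblasCreate via the abstraction macros):

// Standalone CUDA sketch of the stream + BLAS-handle lifecycle used by MatrixElementKernelDevice.
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cassert>
#include <cstdio>

// Stand-in for the checkGpuBlas helper defined in GpuRuntime.h (illustrative only)
#define checkGpuBlasSketch( code )                                                    \
  {                                                                                   \
    cublasStatus_t s_ = ( code );                                                     \
    if( s_ != CUBLAS_STATUS_SUCCESS )                                                 \
    {                                                                                 \
      printf( "ERROR! assertGpuBlas: '%d' in %s:%d\n", (int)s_, __FILE__, __LINE__ ); \
      assert( s_ == CUBLAS_STATUS_SUCCESS );                                          \
    }                                                                                 \
  }

int main()
{
  cudaStream_t stream;                                     // gpuStream_t
  cudaStreamCreate( &stream );                             // gpuStreamCreate
  cublasHandle_t handle;                                   // gpuBlasHandle_t
  checkGpuBlasSketch( cublasCreate( &handle ) );           // gpuBlasCreate
  checkGpuBlasSketch( cublasSetStream( handle, stream ) ); // gpuBlasSetStream: BLAS calls on this handle now run on 'stream'
  // ... gpuBlasTgemm-style calls go here (cublasSgemm or cublasDgemm depending on MGONGPU_FPTYPE2_FLOAT) ...
  checkGpuBlasSketch( cublasDestroy( handle ) );           // gpuBlasDestroy
  cudaStreamDestroy( stream );                             // gpuStreamDestroy
  return 0;
}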
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
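On the two questions left in the comments above: gpuPeekAtLastError reports launch and configuration errors of the preceding kernel launches immediately and without blocking, whereas gpuDeviceSynchronize waits for the kernels to finish and therefore makes asynchronous execution errors surface at this well-defined point instead of at a later, unrelated API call (the behaviour the second comment describes). A minimal standalone CUDA sketch of the pattern, with an illustrative kernel name:

// Minimal CUDA sketch of the launch-then-check pattern used after sigmaKin above.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void dummyKernel( int* out )
{
  out[blockIdx.x * blockDim.x + threadIdx.x] = threadIdx.x;
}

int main()
{
  int* d_out = nullptr;
  cudaMalloc( (void**)&d_out, 32 * sizeof( int ) );
  dummyKernel<<<1, 32>>>( d_out );
  // Catches invalid launches (e.g. a bad grid/block configuration) immediately, without blocking
  printf( "peek: %s\n", cudaGetErrorString( cudaPeekAtLastError() ) );
  // Blocks until the kernel completes, so asynchronous execution errors surface here
  printf( "sync: %s\n", cudaGetErrorString( cudaDeviceSynchronize() ) );
  cudaFree( d_out );
  return 0;
}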
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h index 65a101888d..2fa0ce29e0 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ 
namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer 
DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index 99573ab87a..20611bde8f 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,9 +101,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -109,10 +112,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,43 +171,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -219,7 +275,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -228,14 +283,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
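The DeviceAccessJamp2 helper above fixes the layout of the jamp2 super-buffer as one contiguous block of nevt values per color, accessed as buffer[icol * nevt + ievt]. A minimal CUDA sketch of a kernel accumulating one helicity's per-color |jamp|^2 into such a buffer (kernel and argument names are illustrative; atomicAdd on double assumes compute capability 6.0 or newer):

// Illustrative CUDA sketch of accumulation into a [ncolor][nevt] buffer laid out as buffer[icol * nevt + ievt].
#include <cuda_runtime.h>

__global__ void addJamp2OneHel( double* colJamp2s,      // in/out: [ncolor * nevt], running sum over helicities
                                const double* jamp2Hel, // input:  [ncolor * nevt], |jamp|^2 for one helicity
                                int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  for( int icol = 0; icol < ncolor; icol++ )
    // atomicAdd because kernels for different helicities may run concurrently on separate streams
    atomicAdd( &colJamp2s[icol * nevt + ievt], jamp2Hel[icol * nevt + ievt] );
}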
@@ -261,14 +319,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -292,7 +346,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -306,7 +359,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -317,6 +369,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -405,156 +461,43 @@ namespace mg5amcCpu jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gu_ttxu()?) 
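The squared color flows jamp2 accumulated just above (with atomicAdd in the CUDA case, since different helicity streams update the same colAllJamp2s entries concurrently) feed the event-by-event color choice implemented further below in select_col. A minimal CPU-side sketch of that cumulative selection, omitting the icolamp/iconfig masking and using illustrative names only:

#include <cassert>
// Given the per-color squared flows jamp2[0..ncolor-1] and a random number rndcol in [0,1),
// return the chosen color in the Fortran convention [1,ncolor]: the first color whose
// cumulative fraction of the total exceeds rndcol.
int chooseColor( const double* jamp2, int ncolor, double rndcol )
{
  double total = 0;
  for( int icol = 0; icol < ncolor; icol++ ) total += jamp2[icol];
  assert( total > 0 );
  double cumul = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    cumul += jamp2[icol];
    if( rndcol < cumul / total ) return icol + 1;
  }
  return ncolor; // guard against rounding when rndcol is very close to 1
}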
- - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -610,7 +553,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -644,6 +591,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -685,6 +636,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -805,8 +760,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -814,25 +769,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -977,13 +1110,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -995,18 +1122,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1031,93 +1163,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1159,7 +1228,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1182,7 +1251,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1191,21 +1260,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1219,8 +1290,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1236,11 +1309,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1342,14 +1416,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h index b501a9772e..2c0025c7b9 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,6 +79,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 5; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -125,7 +127,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -133,9 +135,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -155,34 +159,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum 
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f index b0cc58e89c..340d51dbfa 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f index 2b281a8200..83f5f0b209 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,14 +140,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF D2=PDG2PDF(LPP(IB(2)),1, IB(2),XBK(IB(2)), QSCALE) U2=PDG2PDF(LPP(IB(2)),2, IB(2),XBK(IB(2)), QSCALE) @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -309,6 +309,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -393,18 +397,18 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) D2(IVEC)=PDG2PDF(LPP(IB(2)),1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) U2(IVEC)=PDG2PDF(LPP(IB(2)),2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) S2(IVEC)=PDG2PDF(LPP(IB(2)),3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) C2(IVEC)=PDG2PDF(LPP(IB(2)),4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -486,51 +490,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/color_sum.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/color_sum.cc new file mode 100644 index 0000000000..42eca2f7c9 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/color_sum.cc @@ -0,0 +1,429 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
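The algebra behind the comment above, for a real symmetric color matrix $M$ and color flows $J = A + iB$ (a sketch of the identity, written out once for clarity):

$$ J^\dagger M J = (A - iB)^{T} M (A + iB) = A^{T}MA + i\,A^{T}MB - i\,B^{T}MA + B^{T}MB = A^{T}MA + B^{T}MB $$

since $A^{T}MB = B^{T}MA$ when $M$ is real and symmetric. This is why the code only needs the real and imaginary parts of the jamps separately, and why the symmetric matrix can be folded into the triangular cf2 with a factor 2 on the off-diagonal terms.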
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for all good helicities + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/color_sum.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/configs.inc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/configs.inc index 225cf5aca4..0a6b8dbc07 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/configs.inc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/configs.inc @@ -57,3 +57,5 @@
C Diagram 5 DATA (SPROP(I,-3,5),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/5/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/cudacpp_overlay.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/driver.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/driver.f index c2eadb2c31..aa93a3d195 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/driver.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/fbridge.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/makefile_original.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f index 1efce64e40..bb9c6d6440 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -74,10 +74,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! 
-1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -254,17 +251,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -341,7 +327,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -387,7 +373,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -430,31 +417,28 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(1,3,2) T(5,4) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(1,3,4) T(5,2) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(1,5,2) T(3,4) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(1,5,4) T(3,2) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -507,10 +491,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -519,6 +505,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index 6dc0abd17c..6dbbb43f91 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,9 +101,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -109,10 +112,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,43 +171,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -219,7 +275,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -228,14 +283,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
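// A minimal standalone sketch (not part of the patch) of the flat-index arithmetic implied by the
// jamp super-buffer layout jamp[2][ncolor][nGoodHel][nevt] that calculate_jamps fills and that is
// documented in the cuBLAS striding note of color_sum_blas earlier in this diff. The helper name
// jampFlatIndex and the toy dimensions are hypothetical; the real code goes through the
// DeviceAccessJamp accessor instead of computing indices by hand.
#include <cassert>
inline int jampFlatIndex( int ix2, int icol, int ihel, int ievt, int ncolor, int nhel, int nevt )
{
  // ix2 == 0 selects the real part, ix2 == 1 the imaginary part
  return ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt;
}
int main()
{
  const int ncolor = 4, nhel = 3, nevt = 8; // small toy dimensions
  // e.g. the imaginary part of jamp for icol=2, ihel=1, ievt=5
  assert( jampFlatIndex( 1, 2, 1, 5, ncolor, nhel, nevt ) == ( ( 1 * ncolor + 2 ) * nhel + 1 ) * nevt + 5 );
  return 0;
}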
@@ -261,14 +319,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -292,7 +346,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -306,7 +359,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -317,6 +369,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -405,156 +461,43 @@ namespace mg5amcCpu jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gux_ttxux()?) 
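// A minimal standalone sketch of how the jamp2 running sums stored just above are later consumed for
// the event-by-event color choice (see select_col further down in this diff): the squared jamps allowed
// by icolamp for the selected iconfig are accumulated into a cumulative targetamp, and the first color
// whose normalized cumulative sum exceeds the random number is chosen, returned in the Fortran range
// [1,ncolor]. The names pickColor and icolampMask are hypothetical and used here only for illustration.
#include <cstdio>
int pickColor( const double* jamp2, const bool* icolampMask, int ncolor, double rndcol )
{
  double targetamp[16] = { 0 }; // assume ncolor <= 16 for this sketch
  for( int icolC = 0; icolC < ncolor; icolC++ )
  {
    targetamp[icolC] = ( icolC == 0 ? 0 : targetamp[icolC - 1] );
    if( icolampMask[icolC] ) targetamp[icolC] += jamp2[icolC];
  }
  for( int icolC = 0; icolC < ncolor; icolC++ )
    if( rndcol < targetamp[icolC] / targetamp[ncolor - 1] ) return icolC + 1; // NB Fortran [1,ncolor]
  return ncolor; // numerical safety net (rndcol very close to 1)
}
int main()
{
  const double jamp2[4] = { 0.1, 0.4, 0.2, 0.3 }; // toy |jamp|^2 sums over helicities
  const bool mask[4] = { true, true, true, true }; // toy icolamp mask (all colors allowed)
  printf( "selected color = %d\n", pickColor( jamp2, mask, 4, 0.45 ) ); // cumulative 0.1,0.5,0.7,1.0 -> color 2
  return 0;
}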
- - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -610,7 +553,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -644,6 +591,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -685,6 +636,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -805,8 +760,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -814,25 +769,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -977,13 +1110,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -995,18 +1122,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1031,93 +1163,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1159,7 +1228,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1182,7 +1251,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1191,21 +1260,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1219,8 +1290,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1236,11 +1309,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1342,14 +1416,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h index d658e0394e..7a811e35e9 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
@@ -19,6 +19,7 @@
 #include "mgOnGpuVectors.h"
+#include "GpuAbstraction.h"
 #include "Parameters_sm.h"
 #include
@@ -78,6 +79,7 @@ namespace mg5amcCpu
 static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
 static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
 static constexpr int ndiagrams = 5; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu-
+ static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu-
 // Hardcoded parameters for this process (constant class variables)
 // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)]
@@ -125,7 +127,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
- __global__ void
+ void
 sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4]
 const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
 fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
@@ -133,9 +135,11 @@
 fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
 fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
 #endif
- bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation)
+ fptype_sv* allJamps, // output: jamp[ncolor*2*nevt]
+ bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation)
+ const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #else
- __global__ void
+ void
 sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4]
 const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
 fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
@@ -155,34 +159,45 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
- __global__ void
+ void
 sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4]
 const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
 const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
- const fptype* allrndcol, // input: random numbers[nevt] for color selection
- fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ const fptype* allrndcol, // input: random numbers[nevt] for color selection
 const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
- fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
- fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
 #endif
+ fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
 int* allselhel, // output: helicity selection[nevt]
- int* allselcol // output: helicity selection[nevt]
- );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ int* allselcol, // output: color selection[nevt]
+ fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f index e36675626f..f9cde14dc2 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f index 61bb13c3e7..136c6cded7 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,14 +140,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -309,6 +309,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -393,18 +397,18 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -486,51 +490,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/color_sum.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/color_sum.cc new file mode 100644 index 0000000000..42eca2f7c9 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/color_sum.cc @@ -0,0 +1,429 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
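+// This new file implements the color sum, i.e. it adds to |M|^2, for one helicity, the quantity sum_{i,j} jamp[i]^* * ( colorMatrix[i][j] / colorDenom[i] ) * jamp[j],
+// either on CPU (SIMD, color_sum_cpu below) or on GPU (custom kernel color_sum_kernel, or cuBLAS/hipBLAS GEMMs in color_sum_blas).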
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
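+ // In formulas, the triangular loop below accumulates deltaMEs = sum_i cf2[i][i]*(R_i^2+I_i^2) + sum_i sum_{j>i} cf2[i][j]*(R_i*R_j+I_i*I_j),
+ // where R_i/I_i are the real/imaginary parts of jamp[i] and cf2[i][j] already includes the factor 2 and the 1/colorDenom[i] normalization.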
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/color_sum.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/configs.inc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/configs.inc index 693e4354b0..28a94fd35a 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/configs.inc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/configs.inc @@ 
-57,3 +57,5 @@ C Diagram 5 DATA (SPROP(I,-3,5),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/5/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/cudacpp_overlay.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/driver.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/driver.f index c2eadb2c31..aa93a3d195 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/driver.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/fbridge.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/makefile_original.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f index c8fbbe9e22..49b7ddbf25 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -74,10 +74,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! 
-1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -254,17 +251,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -341,7 +327,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -387,7 +373,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -430,31 +417,28 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(1,2,4) T(3,5) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(1,2,5) T(3,4) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(1,3,4) T(2,5) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(1,3,5) T(2,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -507,10 +491,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -519,6 +505,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/addmothers.f index 9a31ed201d..d6cded9a2d 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! 
color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else @@ -220,7 +221,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, ncolmp=0 endif if(mo_color.gt.1.and. - $ mo_color.ne.3.and.mo_color.ne.8)then + $ mo_color.ne.3.and.mo_color.ne.8.and.mo_color.ne.6)then da_color(1)=get_color(jpart(1,ida(1))) da_color(2)=get_color(jpart(1,ida(2))) call write_error(da_color(1), da_color(2), mo_color) @@ -326,8 +327,8 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif endif !end of check on LC -c Just zero helicity info for intermediate states - jpart(7,i) = 0 +c Just No helicity info for intermediate states + jpart(7,i) = 9 enddo ! do i 100 continue if (is_LC) call check_pure_internal_flow(icolalt,jpart, maxcolor) @@ -586,13 +587,13 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, i3=i3+1 c color for t-channels needs to be reversed if(i3.eq.1) icol(2,ires)=icolmp(1,i) - if(i3.eq.2) icol(1,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 c color for t-channels needs to be reversed if(i3bar.eq.1) icol(1,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(2,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(2,i) endif enddo @@ -764,6 +765,14 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, endif endif c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) + elseif(mo_color.eq.6.and.i3.eq.0.and.i3bar.eq.2)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue + elseif(mo_color.eq.6.and.i3.eq.2.and.i3bar.eq.0)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue else c Don't know how to deal with this call write_error(i3,i3bar,mo_color) @@ -814,12 +823,12 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(icolmp(1,i).gt.0)then i3=i3+1 if(i3.eq.1) icol(1,ires)=icolmp(1,i) - if(i3.eq.2) icol(2,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 if(i3bar.eq.1) icol(2,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(1,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(2,i) endif enddo @@ -830,23 +839,33 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(n3.le.1.and.n3bar.eq.0) icol(2,ires)=0 if(i3.ne.n3.or.i3bar.ne.n3bar) then - if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.0.and.i3.eq.0)then + if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.i3)then c This is an epsilon index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(1,ires)=maxcolor + if(i3.eq.0) then + maxcolor=maxcolor+1 + icol(1,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(2,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(2,ires)=-maxcolor endif - 
elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.0.and.i3bar.eq.0)then + elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.i3bar)then c This is an epsilonbar index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(2,ires)=maxcolor + if(i3bar.eq.0)then + maxcolor=maxcolor+1 + icol(2,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(1,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(1,ires)=-maxcolor endif elseif(n3.gt.0.and.n3bar.eq.0.and.i3-i3bar.eq.n3.or. $ n3bar.gt.0.and.n3.eq.0.and.i3bar-i3.eq.n3bar.or. @@ -961,6 +980,12 @@ subroutine fix_s_color_indices(n3,n3bar,i3,i3bar,ncolmp,icolmp, if(n3.eq.1) icol(1,ires)=max_n3 if(n3bar.eq.1) icol(2,ires)=min_n3bar endif + do i=ires,-1 + if (icol(1,i).eq.maxcol) icol(1,i)=mincol + if (icol(1,i).eq.-maxcol) icol(1,i)=-mincol + if (icol(2,i).eq.maxcol) icol(2,i)=mincol + if (icol(2,i).eq.-maxcol) icol(2,i)=-mincol + enddo c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) endif else diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cluster.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/color_sum.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
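+// This header declares the color-sum interface: the DeviceAccessJamp striding helper and the GPU entry points
+// (createNormalizedColorMatrix, color_sum_gpu, color_sum_kernel), plus color_sum_cpu for the C++/SIMD backends.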
+ +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef 
MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
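+ # NB: HIP_HOME is derived from 'hipconfig --rocmpath' above, so this checks the ROCm include tree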
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024)
+
+# To be used after the project makefile
+SHELL := /bin/bash
+
+# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829)
+# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing
+include ../../src/cudacpp_config.mk
+ifeq ($(CUDACPP_BUILDDIR),)
+ $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!)
+endif
+
+# Basic uname helpers (if not already set)
+UNAME_S ?= $(shell uname -s)
+UNAME_P ?= $(shell uname -p)
+
+# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html
+FFLAGS+= -cpp
+
+# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740)
+CXXFLAGS = -O3 -Wall -Wshadow -Wextra
+
+# Add -std=c++17 explicitly to avoid build errors on macOS
+# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked"
+ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),)
+ CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3
+endif
+
+# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran)
+ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
+ override CXX := ccache $(CXX)
+endif
+
+# ----------------------------------------------------------------------
+# Backend library names and process id
+# ----------------------------------------------------------------------
+CUDACPP_MAKEFILE := cudacpp.mk
+processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
+
+ifeq ($(BACKEND),cuda)
+ CUDACPP_COMMONLIB := mg5amc_common_cuda
+ CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda
+else ifeq ($(BACKEND),hip)
+ CUDACPP_COMMONLIB := mg5amc_common_hip
+ CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip
+else
+ CUDACPP_COMMONLIB := mg5amc_common_cpp
+ CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp
+endif
+
+# ----------------------------------------------------------------------
+# Libraries and link line adjustments
+# ----------------------------------------------------------------------
+# Prefer LIBDIR everywhere; base makefile already defines LIBDIR.
+LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \
+    -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias
+
+# OpenMP: enable only if requested, USEOPENMP=1 (#758)
+ifeq ($(USEOPENMP),1)
+ ifneq ($(shell $(CXX) --version | egrep '^Intel'),)
+ override OMPFLAGS = -fopenmp
+ LINKLIBS += -liomp5 # see #578
+ LINKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy'
+ else ifneq ($(shell $(CXX) --version | egrep '^clang'),)
+ override OMPFLAGS = -fopenmp
+ # For the *cpp* binary with clang, ensure libomp is found
+ $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604
+ else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),)
+ override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang
+ else
+ override OMPFLAGS = -fopenmp
+ endif
+endif
+
+# ----------------------------------------------------------------------
+# Objects & targets
+# ----------------------------------------------------------------------
+# Keep driver* separate from PROCESS; we form DSIG groups below. 
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cuts.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. + return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. 
Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. + +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/genps.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/genps.f index 1c32e93f5d..5449ab9e30 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/genps.f @@ -124,7 +124,8 @@ subroutine gen_mom(iconfig,mincfig,maxcfig,invar,wgt,x,p1) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer fake_id + common/to_sprop/sprop,tprid,fake_id logical firsttime double precision xprop(3,nexternal),tprop(3,nexternal) @@ -1373,6 +1374,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1389,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not. 
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1637,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1657,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1818,8 +1834,8 @@ double precision function get_channel_cut(p, config) common/to_forest/ iforest, tstrategy integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) - integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer tprid(-max_branch:-1,lmaxconfigs), fake_id + common/to_sprop/sprop,tprid,fake_id double precision stot,m1,m2 common/to_stot/stot,m1,m2 @@ -1915,7 +1931,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1946,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile b/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile deleted file mode 100644 index 49e6800fff..0000000000 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile +++ /dev/null @@ -1,327 +0,0 @@ -SHELL := /bin/bash - -include ../../Source/make_opts - -# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) -# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing -include ../../src/cudacpp_config.mk -ifeq ($(CUDACPP_BUILDDIR),) -$(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) -endif - -# Disable all Fortran warnings? 
-FFLAGS+= -w - -# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html -FFLAGS+= -cpp - -# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) -CXXFLAGS = -O3 -Wall -Wshadow -Wextra - -# Add -std=c++17 explicitly to avoid build errors on macOS -# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" -ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 -endif - -# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) -ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) - override CXX:=ccache $(CXX) -endif -###ifeq ($(USECCACHE)$(shell echo $(FC) | grep ccache),1) -### override FC:=ccache $(FC) -###endif - -# Load additional dependencies of the bias module, if present -ifeq (,$(wildcard ../bias_dependencies)) -BIASDEPENDENCIES = -else -include ../bias_dependencies -endif - -# Definitions - -LIBDIR = ../../lib/ -BINDIR = ../../bin/ -PROG = madevent - -ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") - include ../MadLoop_makefile_definitions -else - LINK_LOOP_LIBS = - LOOP_LIBS = - LOOP_INCLUDE = - LINK_MADLOOP_LIB = - MADLOOP_LIB = -endif - -LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias - -CUDACPP_MAKEFILE=cudacpp.mk -processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') -ifeq ($(BACKEND),cuda) -CUDACPP_COMMONLIB=mg5amc_common_cuda -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda -else ifeq ($(BACKEND),hip) -CUDACPP_COMMONLIB=mg5amc_common_hip -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip -else -CUDACPP_COMMONLIB=mg5amc_common_cpp -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp -endif - -LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) - -ifneq ("$(wildcard ../../Source/RUNNING)","") - LINKLIBS += -lrunning - LIBS += $(LIBDIR)librunning.$(libext) -endif - - -# Source files - -MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) -MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) -ifeq ($(strip $(MATRIX_HEL)),) - MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) -endif - - -PROCESS= myamp.o genps.o unwgt.o setcuts.o get_color.o \ - cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ - idenparts.o dummy_fct.o - -DSIG=driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) -DSIG_cudacpp=driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) - -SYMMETRY = symmetry.o idenparts.o - -# Binaries - -ifeq ($(UNAME),Darwin) -LDFLAGS += -lc++ # avoid 'Undefined symbols' for chrono::steady_clock on macOS (checked with otool -L libmg5amc_gg_ttx_cpp.so) -LDFLAGS += -mmacosx-version-min=11.3 # avoid "ld: warning: object file was built for newer macOS version than being linked" -else -LDFLAGS += -Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 (not supported on macOS) -endif - -# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) -.DEFAULT_GOAL := all - -ifeq ($(BACKEND),cuda) -all: $(PROG)_fortran 
$(CUDACPP_BUILDDIR)/$(PROG)_cuda -else ifeq ($(BACKEND),hip) -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip -else -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp -endif - -# Disable OpenMP by default: enable OpenMP only if USEOPENMP=1 (#758) -ifeq ($(USEOPENMP),1) -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -override OMPFLAGS = -fopenmp -LINKLIBS += -liomp5 # see #578 -LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -override OMPFLAGS = -fopenmp -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 -else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang -else -override OMPFLAGS = -fopenmp -endif -endif - -$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o - $(FC) -o $(PROG)_fortran $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) - -$(LIBS): .libs - -.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat - cd ../../Source; make - touch $@ - -$(CUDACPP_BUILDDIR)/.cudacpplibs: - $(MAKE) -f $(CUDACPP_MAKEFILE) - touch $@ - -# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH -# Use relative paths with respect to the executables ($ORIGIN on Linux) -# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary -ifeq ($(UNAME_S),Darwin) - override LIBFLAGSRPATH = -else ifeq ($(USEBUILDDIR),1) - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' -else - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' -endif - -.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link - -madevent_fortran_link: $(PROG)_fortran - rm -f $(PROG) - ln -s $(PROG)_fortran $(PROG) - -madevent_cuda_link: - $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) - -madevent_hip_link: - $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) - -madevent_cpp_link: - $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -override SUPPORTED_AVXS = cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto -madevent_%_link: - @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then echo "ERROR! 
Invalid target '$@' (supported madevent_cpp*_link targets are: $(foreach avx,$(SUPPORTED_AVXS),'madevent_cpp$(avx)_link'))"; exit 1; fi - $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_cuda now uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_hip also uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -counters.o: counters.cc timer.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -ompnumthreads.o: ompnumthreads.cc ompnumthreads.h - $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ - -$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) - $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) - -gensym: $(SYMMETRY) configs.inc $(LIBS) - $(FC) -o gensym $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) - -###ifeq (,$(wildcard fbridge.inc)) # Pointless: fbridge.inc always exists as this is the cudacpp-modified makefile! -###$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat -### cd ../../Source/MODEL; make -### -###$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat -### cd ../../Source; make -### -###$(LIBDIR)libpdf.$(libext): -### cd ../../Source/PDF; make -### -###$(LIBDIR)libgammaUPC.$(libext): -### cd ../../Source/PDF/gammaUPC; make -###endif - -# Add source so that the compiler finds the DiscreteSampler module. 
-$(MATRIX): %.o: %.f - $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%.o: %.f - $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%_cudacpp.o: %.f - $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ - -# Dependencies - -driver.f: genps.inc -symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc -genps.o: genps.inc nexternal.inc configs.inc -dummy_fct.0: run.inc genps.inc -cuts.o: genps.inc nexternal.inc pmass.inc -setcuts.o: genps.inc run_config.inc -invarients.o: genps.inc nexternal.inc -myamp.o: props.inc genps.inc nexternal.inc -reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ - run_config.inc -cluster.o: cluster.inc genps.inc nexternal.inc message.inc -addmothers.o: genps.inc nexternal.inc symswap.inc message.inc -unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ - run_config.inc -initcluster.o: message.inc - -# Extra dependencies on discretesampler.mod - -auto_dsig.o: .libs -driver.o: .libs -driver_cudacpp.o: .libs -$(MATRIX): .libs -genps.o: .libs - -# Cudacpp bldall targets - -ifeq ($(UNAME_P),ppc64le) -bldavxs: bldnone bldsse4 -else ifeq ($(UNAME_P),arm) -bldavxs: bldnone bldsse4 -else -bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z -endif - -ifneq ($(shell which hipcc 2>/dev/null),) -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldhip bldcuda bldavxs -else -bldall: bldhip bldavxs -endif -else -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldcuda bldavxs -else -bldall: bldavxs -endif -endif - -bldcuda: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cuda - -bldhip: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=hip - -bldnone: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppnone - -bldsse4: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 - -bldavx2: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 - -bld512y: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y - -bld512z: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z - -# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) - -clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn - $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(CUDACPP_BUILDDIR)/$(PROG)_hip - -cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src - $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall - rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs - rm -f .libs - -cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src - make -C ../../Source cleanall - rm -rf $(LIBDIR)libbias.$(libext) - rm -f ../../Source/*.mod ../../Source/*/*.mod - -distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation - $(MAKE) -f $(CUDACPP_MAKEFILE) distclean diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile b/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile new file mode 120000 index 0000000000..9fba275947 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile @@ -0,0 +1 @@ +makefile_wrapper.mk \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile_original.mk new file mode 
100644 index 0000000000..348c283be7 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile_original.mk @@ -0,0 +1,101 @@ +include ../../Source/make_opts +FFLAGS+= -w + +# Load additional dependencies of the bias module, if present +ifeq (,$(wildcard ../bias_dependencies)) +BIASDEPENDENCIES = +else +include ../bias_dependencies +endif + +# Definitions + +LIBDIR = ../../lib/ +BINDIR = ../../bin/ +PROG = madevent + +ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") + include ../MadLoop_makefile_definitions +else + LINK_LOOP_LIBS = + LOOP_LIBS = + LOOP_INCLUDE = + LINK_MADLOOP_LIB = + MADLOOP_LIB = +endif + +LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L../../lib/ -ldhelas -ldsample -lmodel -lgeneric -lpdf -lgammaUPC -lcernlib $(llhapdf) -lbias + +LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) + +ifneq ("$(wildcard ../../Source/RUNNING)","") + LINKLIBS += -lrunning + LIBS += $(LIBDIR)librunning.$(libext) +endif + + +# Source files + +MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) +MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) +ifeq ($(strip $(MATRIX_HEL)),) + MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) +endif + + +PROCESS= driver.o myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o \ + $(patsubst %.f,%.o,$(wildcard auto_dsig*.f)) \ + +SYMMETRY = symmetry.o idenparts.o + +# Binaries + +$(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) + $(FC) -o $(PROG) $(PROCESS) $(MATRIX) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) + +$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat + cd ../../Source/MODEL; make + +$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat + cd ../../Source; make + +$(LIBDIR)libpdf.$(libext): + cd ../../Source/PDF; make + +$(LIBDIR)libgammaUPC.$(libext): + cd ../../Source/PDF/gammaUPC; make + +# Add source so that the compiler finds the DiscreteSampler module. 
+$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +# Dependencies + +driver.f: genps.inc +symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc +genps.o: genps.inc nexternal.inc configs.inc +dummy_fct.0: run.inc genps.inc +cuts.o: genps.inc nexternal.inc pmass.inc +setcuts.o: genps.inc run_config.inc +invarients.o: genps.inc nexternal.inc +myamp.o: props.inc genps.inc nexternal.inc +reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ + run_config.inc +cluster.o: cluster.inc genps.inc nexternal.inc message.inc +addmothers.o: genps.inc nexternal.inc symswap.inc message.inc +unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ + run_config.inc +initcluster.o: message.inc + +clean: + $(RM) *.o gensym madevent madevent_forhel diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/myamp.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min with dsqrt_shatmax**2 with the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/reweight.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/reweight.f index 0a0bafa7c1..9d8fe1c4f0 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/reweight.f @@ -976,9 +976,9 @@ logical function setclscales(p, keepq2bck, ivec) $ ' and jcentral is ',jcentral(1),jcentral(2) if (btest(mlevel,3)) then - write(*,'(a$)') 'QCD jets (final): ' + write(*,'(a,$)') 'QCD jets (final): ' do i=3,nexternal - if(iqjets(i).gt.0) write(*,'(i3$)') i + if(iqjets(i).gt.0) write(*,'(i3,$)') i enddo write(*,*) endif @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) @@ -1387,7 +1387,9 @@ double precision function rewgt(p, ivec) integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - include 'configs.inc' + integer fake_id + common/to_sprop/sprop,tprid,fake_id +c include 'configs.inc' real*8 xptj,xptb,xpta,xptl,xmtc real*8 xetamin,xqcut,deltaeta common /to_specxpt/xptj,xptb,xpta,xptl,xmtc,xetamin,xqcut,deltaeta @@ -1588,6 +1590,8 @@ double precision function rewgt(p, ivec) $ ipdgcl(1,igraphs(1),iproc),ipart,.false.).and. $ (goodjet(idacl(n,1)).or.goodjet(idacl(n,2)))) then c alpha_s weight + + if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1600,6 +1604,7 @@ double precision function rewgt(p, ivec) write(*,*)' as: ',alphas(alpsfact*dsqrt(q2now)), & '/',asref,' -> ',alphas(alpsfact*dsqrt(q2now))/asref write(*,*)' and G=',SQRT(4d0*PI*ALPHAS(scale)) + endif endif endif endif diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/symmetry.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/symmetry.f index 309540a0a2..d0706e90b4 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/symmetry.f @@ -51,6 +51,7 @@ program symmetry integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) + integer fake_id include 'configs.inc' data use_config/0,lmaxconfigs*0/ @@ -232,7 +233,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +243,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +261,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/unwgt.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/gq_ttq.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/gq_ttq.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py index 42d82818d0..2bc6174b85 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3306,7 +3353,7 @@ def edit_dummy_fct_from_file(self, filelist, outdir): def retro_compatible_custom_fct(lines, mode=None): f77_type = ['real*8', 'integer', 'double precision', 'logical'] - function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ + function_pat = re.compile(r'^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ % {'type':'|'.join(f77_type)}, re.I+re.M) include_pat = re.compile(r"\s+include\s+[\'\"]([\w\./]*)") @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - 
logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/check_param_card.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py index 9ff7390cf5..8de498fcc2 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. 
Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. (beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. 
Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" @@ -6868,8 +6890,9 @@ def handle_alarm(signum, frame): else: log_level=20 - - if run_card: + if run_card and (run_card['lpp1'] !=0 or run_card['lpp2'] !=0): + # They are likely case like lpp=+-3, where alpas not need reset + # but those have dedicated name of pdf avoid the reset as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, 'cteq6_l': 0.118, diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/file_writers.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/files.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/files.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - logger.warning(why) + 
logger.warning("fail to cp", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "
To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "
To save bandwidth not all diagrams were converted to PNG."; print PAGE "
To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 'nprocs' in opts: + 
self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/histograms.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/histograms.py index 51ae2914fc..0883cd9613 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the variation. 
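# The gen_ximprove.py gridpack hunk above now splits a channel into several refine jobs
# when the requested events exceed max_request_event, and tags each split directory with
# a letter+number suffix. A simplified sketch of that arithmetic (it omits the later
# adjustment based on min_event_in_iter); split_plan and split_suffixes are illustrative names:
import string

def split_plan(needed_event, max_request_event, max_splitting, split_channels=True):
    nb_split = max(1, (int(needed_event) - 1) // int(max_request_event) + 1)
    if not split_channels:
        nb_split = 1
    return max(1, min(nb_split, max_splitting))

def split_suffixes(nb_split):
    # mirrors new_info['directory'] += alphabet[i % 26] + str((i+1)//26) in the patch
    return [string.ascii_lowercase[i % 26] + str((i + 1) // 26) for i in range(nb_split)]

# example: 9000 requested events with at most 2500 per job -> 4 split jobs a0..d0
n = split_plan(9000, 2500, max_splitting=130)
print(n, split_suffixes(n))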
+ if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2405,10 +2404,10 @@ def output(self, path, format='gnuplot',number_of_ratios = -1, gnuplot_output_list=gnuplot_output_list_v5 else: output, _ = p.communicate() - output.decode(errors='ignore') + output = output.decode(errors='ignore') if not output: gnuplot_output_list=gnuplot_output_list_v5 - elif float(output.split()[1]) < 5. : + elif int(output.split()[1].split('.')[0]) < 5 : gnuplot_output_list=gnuplot_output_list_v4 else: gnuplot_output_list=gnuplot_output_list_v5 @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. 
Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. 
import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/lhe_parser.py index f6e47956cd..d4b94bab10 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -1792,7 +1794,10 @@ def add_decays(self, pdg_to_decay): if particle.pdg in pdg_to_decay and pdg_to_decay[particle.pdg]: one_decay = pdg_to_decay[particle.pdg].pop() self.add_decay_to_particle(i, one_decay) + particle.helicity = 9 return 
self.add_decays(pdg_to_decay) + + return self @@ -2166,10 +2171,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2961,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..dea35930ea 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz 
%(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3667,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_comine_iteration(self, line): + def do_combine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
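# The lhe_parser.py hunk above fixes the sign in the pseudorapidity property:
# eta = 0.5*ln((|p|+pz)/(|p|-pz)), which equals -ln(tan(theta/2)). A quick standalone
# check of the corrected formula (the momentum components are arbitrary test values):
import math

def pseudorapidity(px, py, pz):
    norm = math.sqrt(px**2 + py**2 + pz**2)
    return 0.5 * math.log((norm + pz) / (norm - pz))

px, py, pz = 10.0, 5.0, 40.0
theta = math.acos(pz / math.sqrt(px**2 + py**2 + pz**2))
print(pseudorapidity(px, py, pz), -math.log(math.tan(theta / 2)))  # the two numbers agree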
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
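# The do_pythia8 hunk above now drives the shower through Pythia8's own main164 example
# program by default, keeping MG5aMC_PY8_interface behind the '--old_interface' flag.
# A minimal sketch of the executable lookup order used there; find_pythia8_main is a
# hypothetical helper name, not part of the patch:
import os

def find_pythia8_main(pythia8_path):
    for candidate in (
        os.path.join(pythia8_path, 'share', 'Pythia8', 'examples', 'main164'),
        os.path.join(pythia8_path, 'examples', 'main164'),
    ):
        if os.path.exists(candidate):
            return candidate
    return None  # caller then falls back to the old MG5aMC_PY8_interface

print(find_pythia8_main('/usr/local/pythia8'))  # hypothetical install prefix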
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
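# The parallel-shower hunk above writes one PY8 card per event split: each card gets its
# own Main:numberOfEvents and its HEPMCoutput:scaling multiplied by that split's event
# count, so the per-event weight normalisation survives the splitting. A small sketch of
# that bookkeeping; the partition sizes and base scaling below are made-up numbers:
def per_split_settings(partition_sizes, base_scaling):
    return [{'Main:numberOfEvents': n, 'HEPMCoutput:scaling': base_scaling * float(n)}
            for n in partition_sizes]

for settings in per_split_settings([2500, 2500, 1000], base_scaling=1.0 / 6000.0):
    print(settings)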
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -5596,6 +5635,19 @@ def do_plot(self, line): else: logger.info('No valid files for delphes plot') + def do_compile(self, line): + """compile the current directory """ + + args = self.split_arg(line) + self.ask_run_configuration(mode='parton') + self.run_card = banner_mod.RunCard(pjoin(self.me_dir, 'Cards', 'run_card.dat')) + self.configure_directory(html_opening =False) + + for Pdir in self.get_Pdir(): + misc.sprint(Pdir) + self.compile(['gensym'], cwd=Pdir) + self.compile(['madevent_forhel'], cwd=Pdir) + ############################################################################ def do_syscalc(self, line): """Evaluate systematics variation weights for a given run""" @@ -6132,7 +6184,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in 
Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file.' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6896,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6906,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7023,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... 
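# The remove_empty_events method added above drops G directories whose events.lhe is
# (nearly) empty and explains why, by scanning the channel log backwards for a few known
# messages. A simplified standalone sketch of that classification; the keyword strings
# come from the hunk, classify_empty_channel is a hypothetical helper name:
def classify_empty_channel(log_lines):
    for line in reversed(log_lines):  # the real code uses misc.BackRead to read the file backwards
        if "Impossible BW configuration" in line:
            return "bwconfig"
        if "Loosen cuts or increase max_events" in line:
            return "cuts"
        if "all returned zero" in line:
            return "zero"
    return "unknown"

print(classify_empty_channel(["... iteration 3 ...", "Loosen cuts or increase max_events"]))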
self.exec_cmd('combine_events') @@ -6902,6 +7051,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7066,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7078,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7203,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7223,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/restore_data b/epochX/cudacpp/gq_ttq.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/restore_data +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . 
-mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/sum_html.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s
' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/gq_ttq.mad/bin/madevent b/epochX/cudacpp/gq_ttq.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/madevent +++ b/epochX/cudacpp/gq_ttq.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h b/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h index a304fc85c8..c6aa6132b8 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc index 998cb505a0..c5d271333d 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h index 1565ed5888..890ccfa493 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h index 7c6a082392..be5c5a6357 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk b/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
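Since the mgDebug* macros are now unconditionally empty (the NSIGHT debug instrumentation is flagged above as no longer supported), existing call sites keep compiling but generate no code. A self-contained sketch, repeating the new no-op definitions so it builds standalone; the function body is illustrative, not plugin code:

// The four no-op definitions, as in the mgOnGpuConfig.h hunk above
#define mgDebugDeclare()      /*noop*/
#define mgDebugInitialise()   /*noop*/
#define mgDebug( code, text ) /*noop*/
#define mgDebugFinalise()     /*noop*/

void instrumentedBody()
{
  mgDebugDeclare();           // previously declared a __shared__ counter, now expands to nothing
  mgDebugInitialise();
  // ... real work would go here ...
  mgDebug( 0, __FUNCTION__ ); // start marker, now a no-op
  mgDebug( 1, __FUNCTION__ ); // end marker, now a no-op
  mgDebugFinalise();
}

Each call above expands to an empty statement, so no #ifdef is needed at the call sites.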
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index 8249ac5d67..22b4bcef38 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +56,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006358146667480469  +DEBUG: model prefixing takes 0.004916191101074219  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -166,13 +165,13 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.078 s +8 processes with 40 diagrams generated in 0.068 s Total: 8 processes with 40 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 INFO: Processing color information for process: g u > t t~ u @1 @@ -184,45 +183,45 @@ INFO: Processing color information for process: g u~ > t t~ u~ @1 INFO: Combined process g c~ > t t~ c~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. 
-DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=1 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. 
-Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. 
+DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=1 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. +Generated helas calls for 2 subprocesses (10 diagrams) in 0.028 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.145 s +ALOHA: aloha creates 2 routines in 0.121 s FFV1 FFV1 FFV1 FFV1 VVV1 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. quit -real 0m0.670s -user 0m0.588s -sys 0m0.061s +real 0m0.648s +user 0m0.562s +sys 0m0.079s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gq_ttq.sa/COPYRIGHT b/epochX/cudacpp/gq_ttq.sa/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/gq_ttq.sa/COPYRIGHT +++ b/epochX/cudacpp/gq_ttq.sa/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. 
- * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). 
+ */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
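On the grid selection just above: the Bridge constructor starts from 256 threads per block and keeps halving until nevt == gpublocks * gputhreads, refusing to go below the 32-thread minimum. A standalone restatement of that loop as a helper (the function name and signature are illustrative, not the plugin's API):

#include <stdexcept>
#include <string>
#include <utility>

// Return { gpublocks, gputhreads } with gpublocks * gputhreads == nevt exactly
std::pair<int, int> chooseGpuGrid( int nevt, int threadsDefault = 256, int threadsMin = 32 )
{
  if( nevt < threadsMin || nevt % threadsMin != 0 )
    throw std::runtime_error( "nevt should be a multiple of " + std::to_string( threadsMin ) );
  int gputhreads = threadsDefault;
  int gpublocks = nevt / gputhreads; // may be 0 or too small at first
  while( nevt != gpublocks * gputhreads )
  {
    gputhreads /= 2; // halve the block size until the grid covers nevt exactly
    if( gputhreads < threadsMin )
      throw std::logic_error( "cannot choose gputhreads" ); // unreachable for valid nevt
    gpublocks = nevt / gputhreads;
  }
  return { gpublocks, gputhreads };
}

For example, chooseGpuGrid( 96 ) yields { 3, 32 }, while chooseGpuGrid( 16384 ) keeps the 256-thread default with 64 blocks.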
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
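The surrounding transposition code maps the Fortran AOS layout P_MULTI(0:3, NEXTERNAL, VECSIZE_USED), seen from C as in[ievt][ipar][ip4], onto the cudacpp AOSOA layout out[ipagM][ipar][ip4][ieppM] with ievt = ipagM * neppM + ieppM. A host-only sketch of the same index arithmetic, with npar/np4/neppM hard-coded for illustration (the real code takes them from CPPProcess and MemoryAccessMomenta):

#include <cassert>

// Illustrative AOS (Fortran order) to AOSOA (cudacpp order) momenta copy
void transposeMomentaF2C( const double* in, double* out, int nevt )
{
  constexpr int npar = 5;  // external particles, e.g. a 2->3 process (CPPProcess::npar in the plugin)
  constexpr int np4 = 4;   // E, px, py, pz (CPPProcess::np4 in the plugin)
  constexpr int neppM = 4; // events per memory page (MemoryAccessMomenta::neppM in the plugin)
  assert( nevt % neppM == 0 );
  for( int ipagM = 0; ipagM < nevt / neppM; ipagM++ )
    for( int ip4 = 0; ip4 < np4; ip4++ )
      for( int ipar = 0; ipar < npar; ipar++ )
        for( int ieppM = 0; ieppM < neppM; ieppM++ )
        {
          const int ievt = ipagM * neppM + ieppM;
          const int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM;
          const int fpos = ievt * npar * np4 + ipar * np4 + ip4;
          out[cpos] = in[fpos]; // F2C (Fortran to C), as in the loop above
        }
}

As the comment in the patch notes, this is an AOS to AOSOA conversion rather than a true transposition: for neppM = 1 the two layouts coincide and a plain memcpy suffices.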
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
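GpuRuntime.h now provides checkGpuBlas/assertGpuBlas so that cuBLAS/hipBLAS status codes are verified in the same style as checkGpu wraps runtime calls, and GpuAbstraction.h aliases gpuBlasDgemm to cublasDgemm or hipblasDgemm (with gpuBlasTgemm picking the S or D variant according to MGONGPU_FPTYPE2_FLOAT). A minimal sketch of wrapping a plain double-precision GEMM with these macros; the column-major sizes and leading dimensions are illustrative only, and this is not the plugin's actual color-sum call:

#include "GpuAbstraction.h"
#include "GpuRuntime.h"

#if defined( MGONGPUCPP_GPUIMPL ) && !defined( MGONGPU_HAS_NO_BLAS )
// C(m x n) = A(m x k) * B(k x n), all column-major buffers already resident on the device
void gemmSketch( const double* devA, const double* devB, double* devC, int m, int n, int k )
{
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) ); // cublasCreate or hipblasCreate
  const double alpha = 1.0, beta = 0.0;
  checkGpuBlas( gpuBlasDgemm( handle, GPUBLAS_OP_N, GPUBLAS_OP_N, m, n, k, &alpha, devA, m, devB, k, &beta, devC, m ) );
  checkGpuBlas( gpuBlasDestroy( handle ) );
}
#endif

By contrast, the MatrixElementKernelDevice code later in this patch creates its handle once (m_blasHandle) and destroys it in the destructor, rather than per call as in this sketch.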
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
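When m_blasColorSum is enabled, the sigmaKin call above receives a cuBLAS/hipBLAS handle and a temporary super-buffer so that the color sum can be evaluated as dense linear algebra instead of per-thread loops. The sketch below only illustrates that idea under assumed layouts (a column-major ncolor x nevt matrix of jamp components and a hypothetical dotPerEvent reduction kernel); it is not the plugin's color_sum_blas implementation, which also covers strided-batched GEMMs over helicities and mixed precision.

// Illustrative sketch: one GEMM computes Z = cf * J for all events of one helicity, then a small kernel
// reduces per event; applied to the real and imaginary jamp components this yields the color-summed |M|^2.
#include "cublas_v2.h"

__global__ void dotPerEvent( const double* J, const double* Z, double* ME, int ncolor, int nevt )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  if( ievt >= nevt ) return;
  double sum = 0.;
  for( int icol = 0; icol < ncolor; icol++ )
    sum += J[icol + ncolor * ievt] * Z[icol + ncolor * ievt]; // J and Z are column-major ncolor x nevt (assumed layout)
  ME[ievt] += sum; // add this helicity's contribution to the running sum over helicities
}

void colorSumViaGemm( cublasHandle_t handle, const double* dCf, const double* dJ, double* dZ, double* dME, int ncolor, int nevt )
{
  const double alpha = 1., beta = 0.;
  // Z(ncolor,nevt) = cf(ncolor,ncolor) * J(ncolor,nevt), all column-major
  cublasDgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, ncolor, nevt, ncolor, &alpha, dCf, ncolor, dJ, ncolor, &beta, dZ, ncolor );
  dotPerEvent<<<( nevt + 255 ) / 256, 256>>>( dJ, dZ, dME, ncolor, nevt );
}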
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h index 65a101888d..2fa0ce29e0 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace 
mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer 
DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc index 81ab8669a5..7307dc9db3 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,9 +101,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -109,10 +112,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,43 +171,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -219,7 +275,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -228,14 +283,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
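For reference, the DeviceAccessJamp2 accessor introduced above addresses a [ncolor][nevt] device buffer in icol-major, event-minor order, so that the nevt threads of the grid touch consecutive (coalesced) addresses for each color index. A minimal sketch of a kernel reading such a buffer with that indexing (the sumOverColors kernel itself is hypothetical and not part of this patch):

// Illustrative sketch: same icol * nevt + ievt indexing as DeviceAccessJamp2::kernelAccessIcolConst
__global__ void sumOverColors( const double* colBuffer, double* out, int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  double sum = 0.;
  for( int icol = 0; icol < ncolor; icol++ )
    sum += colBuffer[icol * nevt + ievt]; // one coalesced read per color index for this event/thread
  out[ievt] = sum;
}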
@@ -261,14 +319,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -292,7 +346,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -306,7 +359,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -317,6 +369,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -400,156 +456,43 @@ namespace mg5amcCpu jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gu_ttxu()?) 
- - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
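The comments removed above document the algebra that the color sum evaluates: since the color matrix cf is real and symmetric, |M|^2 reduces to real quadratic forms over the real and the imaginary jamp components separately. As a standalone reference for what the removed implementation around this point (and the new color_sum kernels) computes per event and per helicity, here is a host-side sketch using the ncolor=4 denom and cf values of this gu_ttxu process (std::complex stands in for cxtype; illustrative only):

// Illustrative host-side sketch of the color-matrix quadratic form removed from calculate_wavefunctions/calculate_jamps
#include <complex>

double colorSum( const std::complex<double> jamp[4] ) // QCD partial amplitudes for one event and one helicity
{
  static constexpr double denom[4] = { 1, 1, 1, 1 };
  static constexpr double cf[4][4] = {
    { 12, 4, 4, 0 },
    { 4, 12, 0, 4 },
    { 4, 0, 12, 4 },
    { 0, 4, 4, 12 } };
  double deltaME = 0;
  for( int icol = 0; icol < 4; icol++ )
  {
    // cf is real, so (A-iB) cf (A+iB) = A cf A + B cf B: accumulate real and imaginary parts separately
    double ztempR = 0, ztempI = 0;
    for( int jcol = 0; jcol < 4; jcol++ )
    {
      ztempR += cf[icol][jcol] * jamp[jcol].real();
      ztempI += cf[icol][jcol] * jamp[jcol].imag();
    }
    deltaME += ( ztempR * jamp[icol].real() + ztempI * jamp[icol].imag() ) / denom[icol];
  }
  return deltaME; // this helicity's contribution to the running |M|^2 sum over helicities
}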
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -605,7 +548,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -639,6 +586,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -680,6 +631,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -800,8 +755,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -809,25 +764,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -972,13 +1105,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -990,18 +1117,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1026,93 +1158,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1154,7 +1223,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1177,7 +1246,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1186,21 +1255,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1214,8 +1285,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1231,11 +1304,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1337,14 +1411,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h index b501a9772e..2c0025c7b9 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,6 +79,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 5; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -125,7 +127,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -133,9 +135,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -155,34 +159,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum 
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/color_sum.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/color_sum.cc new file mode 100644 index 0000000000..42eca2f7c9 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/color_sum.cc @@ -0,0 +1,429 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
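A minimal sketch of the jamp super-buffer indexing implied by the parameter comments above (illustrative only, not part of the patch; jampIndex is a hypothetical helper, and the strides follow the DeviceAccessJamp layout comments quoted later in color_sum_blas):

// Hypothetical helper: flat index into the ghelAllJamps super-buffer of shape
// [2][ncolor][nGoodHel][nevt] (ix2 = 0 for the real part, 1 for the imaginary part).
#include <cassert>

inline int jampIndex( int ix2, int icol, int ighel, int ievt,
                      int ncolor, int nGoodHel, int nevt )
{
  assert( ix2 == 0 || ix2 == 1 );
  return ix2 * ncolor * nGoodHel * nevt + icol * nGoodHel * nevt + ighel * nevt + ievt;
}

// The per-helicity sub-buffer handed to calculate_jamps is then just an offset view,
// e.g. fptype* hAllJamps = ghelAllJamps + ighel * nevt, so that inside the kernel
// icol strides by nGoodHel * nevt and ievt strides by 1.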
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
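A standalone scalar illustration of the triangular color sum described in the comment above (plain C++, no SIMD, hypothetical function name; it assumes the ncolor=4 color matrix and unit denominators of this process, so that cf[icol][jcol]/denom[icol] is symmetric and the off-diagonal terms can be doubled):

// Minimal sketch: |M|^2 contribution of one helicity from its color flows jamp[ncolor],
// using only the diagonal and upper triangle of the (normalized) real symmetric color matrix.
#include <array>
#include <complex>

double colorSumOneHelicity( const std::array<std::complex<double>, 4>& jamp )
{
  constexpr int ncolor = 4;
  constexpr double denom[ncolor] = { 1, 1, 1, 1 };
  constexpr double cf[ncolor][ncolor] = { { 12, 4, 4, 0 },
                                          { 4, 12, 0, 4 },
                                          { 4, 0, 12, 4 },
                                          { 0, 4, 4, 12 } };
  double deltaME = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    // Diagonal term plus doubled upper-triangle terms (valid because cf/denom is symmetric here)
    double ztempR = cf[icol][icol] / denom[icol] * jamp[icol].real();
    double ztempI = cf[icol][icol] / denom[icol] * jamp[icol].imag();
    for( int jcol = icol + 1; jcol < ncolor; jcol++ )
    {
      ztempR += 2 * cf[icol][jcol] / denom[icol] * jamp[jcol].real();
      ztempI += 2 * cf[icol][jcol] / denom[icol] * jamp[jcol].imag();
    }
    // Only the real combination survives: for real symmetric M, (A-iB)M(A+iB) = AMA + BMB
    deltaME += jamp[icol].real() * ztempR + jamp[icol].imag() * ztempI;
  }
  return deltaME;
}

For this process the triangular loop gives the same result as the full double loop over the symmetric matrix, with the factors of 2 and the division by denom folded in at compile time, matching the constexpr cf2 scheme above.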
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/color_sum.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/cudacpp_overlay.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ 
b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/fbridge.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/makefile_original.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc index c1c42990a2..d959d2636a 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,9 +101,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -109,10 +112,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,43 +171,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -219,7 +275,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -228,14 +283,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
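For context on how the calculate_jamps kernel described above is scheduled, here is a self-contained CUDA sketch of the one-good-helicity-per-stream pattern used in sigmaKin (toy kernel and a simplified one-value-per-event buffer, hypothetical names; the real super-buffer has the [2][ncolor][nGoodHel][nevt] layout):

// Toy stand-in for calculate_jamps: one launch per good helicity, each on its own stream.
#include <cuda_runtime.h>
#include <vector>

__global__ void fillJampsForHelicity( double* hAllJamps, int ihel, int nevt )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  if( ievt < nevt ) hAllJamps[ievt] = ihel + 0.001 * ievt; // toy payload, one value per event
}

int main()
{
  const int gpublocks = 4, gputhreads = 256, nevt = gpublocks * gputhreads;
  const int nGoodHel = 8;
  double* ghelAllJamps = nullptr; // super-buffer: one nevt-sized slice per good helicity
  if( cudaMalloc( &ghelAllJamps, nGoodHel * nevt * sizeof( double ) ) != cudaSuccess ) return 1;
  std::vector<cudaStream_t> ghelStreams( nGoodHel );
  for( auto& s : ghelStreams ) cudaStreamCreate( &s );
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
  {
    double* hAllJamps = ghelAllJamps + ighel * nevt; // per-helicity sub-buffer (offset view)
    fillJampsForHelicity<<<gpublocks, gputhreads, 0, ghelStreams[ighel]>>>( hAllJamps, ighel, nevt );
  }
  cudaDeviceSynchronize(); // do not start helicity/color selection until all streams are done
  for( auto& s : ghelStreams ) cudaStreamDestroy( s );
  cudaFree( ghelAllJamps );
  return 0;
}

Because several streams may later accumulate into shared per-color buffers concurrently, the actual kernels use atomicAdd for those running sums, as noted in the colAllJamp2s code further below.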
@@ -261,14 +319,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -292,7 +346,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -306,7 +359,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -317,6 +369,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -400,156 +456,43 @@ namespace mg5amcCpu jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gux_ttxux()?) 
- - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -605,7 +548,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -639,6 +586,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -680,6 +631,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -800,8 +755,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -809,25 +764,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -972,13 +1105,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -990,18 +1117,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1026,93 +1158,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1154,7 +1223,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1177,7 +1246,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1186,21 +1255,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1214,8 +1285,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1231,11 +1304,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1337,14 +1411,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h index d658e0394e..7a811e35e9 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,6 +79,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 5; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -125,7 +127,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -133,9 +135,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -155,34 +159,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum 
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/color_sum.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/color_sum.cc new file mode 100644 index 0000000000..42eca2f7c9 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/color_sum.cc @@ -0,0 +1,429 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
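For illustration only, and not part of the patch: the sigmaKin rewrite above launches one calculate_jamps kernel per good helicity on its own CUDA/HIP stream and synchronizes only once before helicity and color selection. A minimal standalone CUDA sketch of that launch-and-synchronize pattern, using a toy kernel, hypothetical names and toy sizes rather than the plugin's real API, could look like this:

    // Sketch of a one-stream-per-helicity launch pattern (toy kernel and sizes, not the plugin API)
    #include <cuda_runtime.h>
    #include <cstdio>

    __global__ void toyKernelPerHelicity( float* out, int ihel, int nevt )
    {
      const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per GPU thread
      out[ihel * nevt + ievt] = (float)ihel;                  // fill one helicity slice of a "super-buffer"
    }

    int main()
    {
      const int nhel = 4, gpublocks = 2, gputhreads = 32, nevt = gpublocks * gputhreads;
      float* d_out = nullptr;
      cudaMalloc( &d_out, nhel * nevt * sizeof( float ) );
      cudaStream_t streams[nhel];
      for( int ihel = 0; ihel < nhel; ihel++ ) cudaStreamCreate( &streams[ihel] );
      for( int ihel = 0; ihel < nhel; ihel++ ) // kernels for different helicities may overlap on the device
        toyKernelPerHelicity<<<gpublocks, gputhreads, 0, streams[ihel]>>>( d_out, ihel, nevt );
      cudaDeviceSynchronize(); // single synchronization point before any cross-helicity reduction or selection
      for( int ihel = 0; ihel < nhel; ihel++ ) cudaStreamDestroy( streams[ihel] );
      cudaFree( d_out );
      printf( "done\n" );
      return 0;
    }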
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
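For illustration only: the comment block above relies on the color matrix being real and symmetric, so that for jamp = A + iB the imaginary cross terms of (A - iB)·C·(A + iB) cancel and only A·C·A + B·C·B survives; the off-diagonal entries can then be folded in with a factor 2 over the upper triangle. A minimal scalar reference in plain C++ (std::complex, dummy jamp values, the 4x4 colorMatrix/colorDenom of this process, no SIMD or accessors), mirroring the triangular loop that follows, might look like this:

    #include <complex>
    #include <cstdio>

    int main()
    {
      constexpr int ncolor = 4;
      constexpr double denom[ncolor] = { 1, 1, 1, 1 };
      constexpr double cf[ncolor][ncolor] = { { 12, 4, 4, 0 }, { 4, 12, 0, 4 }, { 4, 0, 12, 4 }, { 0, 4, 4, 12 } };
      const std::complex<double> jamp[ncolor] = { { 0.1, -0.2 }, { 0.3, 0.4 }, { -0.5, 0.6 }, { 0.7, -0.8 } }; // dummy QCD partial amplitudes
      double deltaME = 0;
      for( int icol = 0; icol < ncolor; icol++ )
      {
        // diagonal term, then off-diagonal terms counted twice (cf is symmetric and denom is constant here)
        double ztempR = cf[icol][icol] / denom[icol] * jamp[icol].real();
        double ztempI = cf[icol][icol] / denom[icol] * jamp[icol].imag();
        for( int jcol = icol + 1; jcol < ncolor; jcol++ )
        {
          ztempR += 2 * cf[icol][jcol] / denom[icol] * jamp[jcol].real();
          ztempI += 2 * cf[icol][jcol] / denom[icol] * jamp[jcol].imag();
        }
        deltaME += jamp[icol].real() * ztempR + jamp[icol].imag() * ztempI; // A.C.A + B.C.B contribution
      }
      printf( "|M|^2 contribution for this helicity: %f\n", deltaME );
      return 0;
    }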
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/color_sum.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/cudacpp_overlay.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ 
b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/fbridge.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/makefile_original.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/color_sum.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt 
+ ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
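For illustration only: the DeviceAccessJamp accessor above documents the "all helicities" striding, with real and imaginary parts stored as two separate [ncolor][nhel][nevt] blocks and the event index running fastest. A small host-side sketch with toy dimensions (not the plugin's accessor classes) that writes and reads one jamp element using exactly that offset formula:

    #include <vector>
    #include <cstdio>

    // Offset matching the striding documented in DeviceAccessJamp (ievt fastest, then ihel, then icol, then re/im block)
    inline int jampOffset( int ix2, int icol, int ihel, int ievt, int ncolor, int nhel, int nevt )
    {
      return ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt;
    }

    int main()
    {
      const int ncolor = 4, nhel = 2, nevt = 8; // toy sizes
      std::vector<double> buffer( 2 * ncolor * nhel * nevt, 0 );
      // store jamp(icol=3, ihel=1, ievt=5) = 1.5 - 2.5i
      buffer[jampOffset( 0, 3, 1, 5, ncolor, nhel, nevt )] = 1.5;  // real part block
      buffer[jampOffset( 1, 3, 1, 5, ncolor, nhel, nevt )] = -2.5; // imaginary part block
      printf( "re=%f im=%f\n",
              buffer[jampOffset( 0, 3, 1, 5, ncolor, nhel, nevt )],
              buffer[jampOffset( 1, 3, 1, 5, ncolor, nhel, nevt )] );
      return 0;
    }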
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. 
+ +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h b/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h index a304fc85c8..c6aa6132b8 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc index 998cb505a0..c5d271333d 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h index 1565ed5888..890ccfa493 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h index d3c4ca5695..7d34de72f8 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose whether cuBLAS and hipBLAS are supported (they may be used for the colour sum on GPU) +// For both CUDA and HIP, by default, assume that BLAS is available, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!]
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk b/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
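As a minimal sketch of how the MGONGPU_HAS_NO_BLAS macro defined in mgOnGpuConfig.h above is meant to be consumed in host code (the helper name below is hypothetical and the calls use the plain cuBLAS API; the plugin's own GPU abstraction layer may wrap this differently):

#include "mgOnGpuConfig.h"
#ifndef MGONGPU_HAS_NO_BLAS
#include "cublas_v2.h"
#endif
#include <stdexcept>

// Create a cuBLAS handle only when BLAS support is compiled in; in
// MGONGPU_HAS_NO_BLAS builds return a null pointer so that the caller
// falls back to the plain CUDA kernel implementation of the colour sum.
inline void* createBlasHandleOrNull() // hypothetical helper, for illustration only
{
#ifndef MGONGPU_HAS_NO_BLAS
  cublasHandle_t handle = nullptr;
  if( cublasCreate( &handle ) != CUBLAS_STATUS_SUCCESS )
    throw std::runtime_error( "cublasCreate failed" );
  return handle;
#else
  return nullptr; // no BLAS available in this build
#endif
}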
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt index c46ef95a65..f4896d16ca 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,17 +46,16 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model heft INFO: Restrict model heft with file models/heft/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  @@ -123,21 +122,21 @@ Defined multiparticle all = g u c d s u~ c~ d~ s~ a ve vm vt e- mu- ve~ vm~ vt~ generate g g > b b~ HIW<=1 INFO: Trying process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Process has 4 diagrams -1 processes with 4 diagrams generated in 0.006 s +1 processes with 4 diagrams generated in 0.007 s Total: 1 processes with 4 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_heft_gg_bb --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_heft_gg_bb INFO: remove old information in CODEGEN_mad_heft_gg_bb -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 @@ -149,59 +148,54 @@ FileWriter b b~ HIG<=1 HIW<=1 @1 INFO: Finding symmetric diagrams for subprocess group gg_bbx -DEBUG: len(subproc_diagrams_for_config) =  4 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1552]  -Generated helas calls for 1 subprocesses (4 diagrams) in 0.009 s -Wrote files for 12 helas calls in 0.076 s +DEBUG: len(subproc_diagrams_for_config) =  4 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1577]  +Generated helas calls for 1 subprocesses (4 diagrams) in 0.011 s +Wrote files for 12 helas calls in 0.085 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.266 s 
+ALOHA: aloha creates 4 routines in 0.231 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 8 routines in 0.250 s +ALOHA: aloha creates 8 routines in 0.197 s VVS3 VVV1 FFV1 FFV1 FFV1 FFS2 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./HelAmps_heft.h -INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./HelAmps_heft.h +INFO: Created file HelAmps_heft.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.cc INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/SubProcesses/P1_gg_bbx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb done. +DEBUG: result.returncode =  0 [output.py at line 273]  +Output to directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb done. 
Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/README +/home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/README Run "open index.html" to see more information about this process. quit -real 0m2.141s -user 0m1.860s -sys 0m0.270s +real 0m2.628s +user 0m2.164s +sys 0m0.450s Code generation completed in 2 seconds ************************************************************ * * @@ -215,7 +209,7 @@ Code generation completed in 2 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -223,10 +217,9 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -245,7 +238,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -253,10 +246,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/heft_gg_bb.mad/COPYRIGHT b/epochX/cudacpp/heft_gg_bb.mad/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/COPYRIGHT +++ b/epochX/cudacpp/heft_gg_bb.mad/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt b/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt index 68b4c46295..311ceaa803 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/heft_gg_bb.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/heft_gg_bb.mad/Cards/proc_card_mg5.dat index 92581deeee..abc60404ab 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/heft_gg_bb.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.5 2025-10-17 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/heft_gg_bb.mad/Cards/run_card.dat b/epochX/cudacpp/heft_gg_bb.mad/Cards/run_card.dat index 8af20dc4e4..3802880982 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Cards/run_card.dat +++ b/epochX/cudacpp/heft_gg_bb.mad/Cards/run_card.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/heft_gg_bb.mad/Cards/run_card_default.dat b/epochX/cudacpp/heft_gg_bb.mad/Cards/run_card_default.dat index 0815703ee4..6917ce597f 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/heft_gg_bb.mad/Cards/run_card_default.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/heft_gg_bb.mad/MGMEVersion.txt b/epochX/cudacpp/heft_gg_bb.mad/MGMEVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/MGMEVersion.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.mad/Source/.make_opts b/epochX/cudacpp/heft_gg_bb.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Source/.make_opts +++ b/epochX/cudacpp/heft_gg_bb.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/heft_gg_bb.mad/Source/alfas_functions.f b/epochX/cudacpp/heft_gg_bb.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/heft_gg_bb.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/heft_gg_bb.mad/Source/cuts.inc b/epochX/cudacpp/heft_gg_bb.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Source/cuts.inc +++ b/epochX/cudacpp/heft_gg_bb.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/heft_gg_bb.mad/Source/make_opts b/epochX/cudacpp/heft_gg_bb.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Source/make_opts +++ b/epochX/cudacpp/heft_gg_bb.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/heft_gg_bb.mad/Source/makefile b/epochX/cudacpp/heft_gg_bb.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Source/makefile +++ b/epochX/cudacpp/heft_gg_bb.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/heft_gg_bb.mad/Source/run_card.inc b/epochX/cudacpp/heft_gg_bb.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Source/run_card.inc +++ b/epochX/cudacpp/heft_gg_bb.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. 
+// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. 
In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). + */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
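The grid-selection logic in the Bridge constructor above can be read in isolation: start from 256 GPU threads per block and halve until nevt == gpublocks*gputhreads, never dropping below s_gputhreadsmin (32). A minimal standalone sketch of that selection, assuming only what the constructor code shows (the helper name chooseGpuGrid is illustrative and not part of the patch):

#include <stdexcept>
#include <utility>

// Mirror of the while loop in the Bridge constructor: nevt must be a non-zero multiple of gputhreadsmin.
inline std::pair<int, int> chooseGpuGrid( int nevt, int gputhreadsmin = 32 )
{
  int gputhreads = 256;             // default number of gpu threads, as in the constructor
  int gpublocks = nevt / gputhreads;
  while( nevt != gpublocks * gputhreads )
  {
    gputhreads /= 2;                // halve the block size until the grid covers nevt exactly
    if( gputhreads < gputhreadsmin ) throw std::logic_error( "cannot choose gputhreads" );
    gpublocks = nevt / gputhreads;
  }
  return { gpublocks, gputhreads }; // e.g. nevt=8192 gives (32,256)
}

For instance, nevt=16416 (a multiple of 32 but not of 64) falls all the way through to gputhreads=32 and gpublocks=513.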
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
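The AOS-to-AOSOA mapping used in these transposition routines is fully determined by the two index formulas fpos = ievt*npar*np4 + ipar*np4 + ip4 and cpos = ipagM*npar*np4*neppM + ipar*np4*neppM + ip4*neppM + ieppM, with ievt = ipagM*neppM + ieppM. A minimal host-side sketch of the same F2C copy, assuming only those conventions (the helper name transposeF2C is illustrative):

#include <cassert>

// AOS in[nevt][npar][np4] (Fortran P_MULTI as read from C) -> AOSOA out[npagM][npar][np4][neppM].
template<typename T>
void transposeF2C( const T* in, T* out, int nevt, int npar, int np4, int neppM )
{
  assert( nevt % neppM == 0 ); // the number of events must be a multiple of neppM
  for( int ievt = 0; ievt < nevt; ievt++ )
  {
    const int ipagM = ievt / neppM;
    const int ieppM = ievt % neppM;
    for( int ipar = 0; ipar < npar; ipar++ )
      for( int ip4 = 0; ip4 < np4; ip4++ )
      {
        const int fpos = ievt * npar * np4 + ipar * np4 + ip4;
        const int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM;
        out[cpos] = in[fpos]; // F2C (Fortran to C)
      }
  }
}

For neppM=1 the two layouts coincide (cpos equals fpos) and the copy degenerates to a memcpy, as noted in the code above.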
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MGVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
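The gpuBlas* and checkGpuBlas abstractions introduced above are meant to let a single code path target either cuBLAS or hipBLAS with uniform error checking. A minimal sketch of the intended usage, assuming a GPU build with BLAS enabled (MGONGPUCPP_GPUIMPL defined and MGONGPU_HAS_NO_BLAS undefined); the function and buffer names are illustrative:

#include "GpuAbstraction.h"
#include "GpuRuntime.h"

// Compute y += x on the device for two double buffers of length n, on a given stream.
void deviceAxpy( int n, const double* d_x, double* d_y, gpuStream_t stream )
{
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) );           // cublasCreate or hipblasCreate
  checkGpuBlas( gpuBlasSetStream( handle, stream ) ); // attach the (e.g. per-helicity) stream
  const double alpha = 1.;
  checkGpuBlas( gpuBlasDaxpy( handle, n, &alpha, d_x, 1, d_y, 1 ) );
  checkGpuBlas( gpuBlasDestroy( handle ) );
}

In the plugin itself a single handle is created per MatrixElementKernelDevice and reused for all good helicities, and the precision-agnostic gpuBlasT* aliases select the S or D entry points according to MGONGPU_FPTYPE2_FLOAT.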
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
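The per-good-helicity streams created in computeGoodHelicities above pair naturally with the gpuLaunchKernelStream macro from GpuAbstraction.h, which is assumed here to expand to a kernel<<< blocks, threads, 0, stream >>>( ... ) launch. A minimal sketch for a GPU build compiled with nvcc or hipcc; the kernel scaleMEs and the function launchPerHelicity are illustrative and not part of the plugin:

#include "GpuAbstraction.h"
#include "GpuRuntime.h"
#include <vector>

__global__ void scaleMEs( double* mes, double factor, int nevt )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  if( ievt < nevt ) mes[ievt] *= factor;
}

// Launch one kernel per good helicity, each on its own stream, over one nevt-sized slice of a super-buffer.
void launchPerHelicity( double* d_helMEs, int nGoodHel, int gpublocks, int gputhreads )
{
  const int nevt = gpublocks * gputhreads;
  std::vector<gpuStream_t> streams( nGoodHel );
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) gpuStreamCreate( &streams[ighel] );
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
    gpuLaunchKernelStream( scaleMEs, gpublocks, gputhreads, streams[ighel], d_helMEs + ighel * nevt, 2., nevt );
  checkGpu( gpuDeviceSynchronize() ); // wait for all streams before reusing the buffers
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) gpuStreamDestroy( streams[ighel] );
}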
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryBuffers.h index 90075da66e..7d7b960511 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_heft_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 
+201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef 
DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc index 5d6a4e1f06..4630760b2c 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_heft.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_heft_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_heft_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 3; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
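// [Editor's illustrative sketch, not part of the patch] The new GPU code path replaces the
// per-event jamp handling with per-helicity "super-buffers": as documented later in this patch,
// ghelAllJamps is laid out as [2 (re/im)][ncolor][nGoodHel][nevt] with the event index running
// fastest (the same striding used by convertD2F_Jamps and by the BLAS color sum). The helper
// below reproduces that flat indexing in plain C++; jampSuperIndex and the sizes used in main()
// are names and values invented for this sketch only.
#include <cassert>
#include <cstdio>

// Flat offset of element (ix2, icol, ighel, ievt) in a buffer of shape [2][ncolor][nGoodHel][nevt]
constexpr int jampSuperIndex( int ix2, int icol, int ighel, int ievt, int ncolor, int nGoodHel, int nevt )
{
  return ix2 * ncolor * nGoodHel * nevt + icol * nGoodHel * nevt + ighel * nevt + ievt;
}

int main()
{
  const int ncolor = 3, nGoodHel = 16, nevt = 8192; // example sizes (this P1_gg_bbx process has ncolor=3, ncomb=16)
  const int ighel = 5, icol = 2, ievt = 42;
  // Offsetting the base pointer by ighel*nevt (as sigmaKin does for hAllJamps) and then indexing
  // with ihel=0 addresses the same element as indexing the full super-buffer with ighel:
  assert( jampSuperIndex( 1, icol, ighel, ievt, ncolor, nGoodHel, nevt ) ==
          ighel * nevt + jampSuperIndex( 1, icol, 0, ievt, ncolor, nGoodHel, nevt ) );
  printf( "flat offset = %d\n", jampSuperIndex( 1, icol, ighel, ievt, ncolor, nGoodHel, nevt ) );
  return 0;
}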
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -381,155 +437,43 @@ namespace mg5amcCpu jamp_sv[1] -= amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_bbx()?) 
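// [Editor's illustrative sketch, not part of the patch] The per-color |jamp|^2 running sums
// accumulated just above (into colAllJamp2s, via atomicAdd now that one CUDA stream per good
// helicity is used) are later turned into a cumulative distribution by select_col, which draws
// one color per event from a uniform random number. The standalone helper below mimics that
// selection on the host; pickColor and its arguments are names invented for this sketch, not
// part of the plugin API.
// Returns the selected color in the Fortran convention [1,ncolor], or 0 if no color contributes
// (analogous to the channelId == 0 case). jamp2 holds the per-color |jamp|^2 sums over good
// helicities, allowed is the icolamp mask for the chosen iconfig, rndcol is uniform in [0,1).
inline int pickColor( const double* jamp2, const bool* allowed, double rndcol, int ncolor )
{
  double cumul[32] = {}; // assume ncolor <= 32 in this sketch (ncolor = 3 for this process)
  double running = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    if( allowed[icol] ) running += jamp2[icol]; // cumulative sum over allowed color flows
    cumul[icol] = running;
  }
  if( running <= 0 ) return 0; // no allowed color flow: leave the selection empty
  for( int icol = 0; icol < ncolor; icol++ )
    if( rndcol < cumul[icol] / cumul[ncolor - 1] ) return icol + 1;
  return ncolor; // numerical safety net (rndcol effectively rounded up to 1)
}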
- - // The color denominators (initialize all array elements, with ncolor=3) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3, 1 }; // 1-D array[3] - - // The color matrix (initialize all array elements, with ncolor=3) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2, 6 }, - { -2, 16, 6 }, - { 2, 2, 6 } }; // 2-D array[3][3] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
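// [Editor's illustrative sketch, not part of the patch] The color algebra removed here is moved
// into the new color_sum.cc; its core is the quadratic form conj(J) * (cf/denom) * J, which for a
// real color matrix reduces to Re(J)*M*Re(J) + Im(J)*M*Im(J), as the comments above explain. The
// standalone function below mirrors the straightforward square-matrix variant for the ncolor=3
// matrix of this P1_gg_bbx subprocess; colorSumOneHelicity is a name invented for this sketch.
#include <complex>

// Color-summed |M|^2 contribution of one helicity, given the ncolor=3 partial amplitudes jamp.
inline double colorSumOneHelicity( const std::complex<double> jamp[3] )
{
  static constexpr double denom[3] = { 3, 3, 1 };
  static constexpr double cf[3][3] = { { 16, -2, 6 }, { -2, 16, 6 }, { 2, 2, 6 } };
  double deltaME = 0;
  for( int icol = 0; icol < 3; icol++ )
  {
    double ztempR = 0, ztempI = 0;
    for( int jcol = 0; jcol < 3; jcol++ )
    {
      ztempR += cf[icol][jcol] * jamp[jcol].real();
      ztempI += cf[icol][jcol] * jamp[jcol].imag();
    }
    deltaME += ( ztempR * jamp[icol].real() + ztempI * jamp[icol].imag() ) / denom[icol];
  }
  return deltaME; // the caller adds this over good helicities and finally divides by 256 (spin/color average)
}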
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -569,7 +513,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -602,6 +550,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MB ); m_masses.push_back( m_pars->mdl_MB ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MB, (fptype)m_pars->mdl_MH, (fptype)m_pars->mdl_WH }; @@ -643,6 +595,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_heft::ZERO ); m_masses.push_back( Parameters_heft::mdl_MB ); m_masses.push_back( Parameters_heft::mdl_MB ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -763,8 +719,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -772,25 +728,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // 
Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -935,13 +1069,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -953,18 +1081,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -989,93 +1122,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1117,7 +1187,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1140,7 +1210,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1149,21 +1219,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1177,8 +1249,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1194,11 +1268,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1300,14 +1375,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h index 30c5663297..cacb35c052 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_heft.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 4; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 3; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running 
sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig.f index 0b39d55964..263997c37e 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig1.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig1.f index c57e06d578..3eaacf358b 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig1.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -442,51 +446,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. 
- IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/color_sum.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/color_sum.cc new file mode 100644 index 0000000000..94b1137d64 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/color_sum.cc @@ -0,0 +1,428 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=3) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3, 1 }; // 1-D array[3] + + // The color matrix (initialize all array elements, with ncolor=3) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2, 6 }, + { -2, 16, 6 }, + { 2, 2, 6 } }; // 2-D array[3][3] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for 
one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over 
helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! 
From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need 
one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/color_sum.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/configs.inc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/configs.inc index b94e284b2f..8f12a38cbe 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/configs.inc +++ 
b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/configs.inc @@ -30,3 +30,5 @@ C Diagram 4 DATA (SPROP(I,-2,4),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/4/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/cudacpp_overlay.mk b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/driver.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/driver.f index ec5722702a..30cca27587 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/driver.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/fbridge.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/makefile_original.mk b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/matrix1.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/matrix1.f index 598338d03e..e5700f7694 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/matrix1.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -71,10 +71,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -227,17 +224,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -307,7 +293,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -350,7 +336,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -393,26 +380,26 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 3) /5.333333333333333D+00, - $ -6.666666666666666D-01,2.000000000000000D+00/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 3) /16,-4,12/ C 1 T(1,2,3,4) - DATA (CF(I, 2),I= 1, 3) /-6.666666666666666D-01 - $ ,5.333333333333333D+00,2.000000000000000D+00/ + DATA (CF(I),I= 4, 5) /16,12/ C 1 T(2,1,3,4) - DATA (CF(I, 3),I= 1, 3) /2.000000000000000D+00 - $ ,2.000000000000000D+00,6.000000000000000D+00/ + DATA (CF(I),I= 6, 6) /18/ C 1 T(3,4) Tr(1,2) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(MDL_MB - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WH.NE.0D0) FK_MDL_WH = SIGN(MAX(ABS(MDL_WH), ABS(MDL_MH - $ *SMALL_WIDTH_TREATMENT)), MDL_WH) + FK_ZERO = 0D0 + IF(MDL_WH.NE.0D0) THEN + FK_MDL_WH = SIGN(MAX(ABS(MDL_WH), ABS(MDL_MH + $ *SMALL_WIDTH_TREATMENT)), MDL_WH) + ELSE + FK_MDL_WH = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
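The triangular colour sum used both by the new Fortran MATRIX1 loop (next hunk) and by color_sum_cpu above relies on the normalized colour matrix CF(i,j)/denom(i) being real and symmetric: |M|^2 = Re(J)·N·Re(J) + Im(J)·N·Im(J), the imaginary cross terms cancel, and only the upper triangle needs to be stored if the off-diagonal entries are pre-doubled. A minimal standalone C++ sketch of this idea follows; the names (packUpperTriangle, colourSum) and the test amplitudes are illustrative only and are not part of the generated sources, while the matrix and denominators are the gg->bbx values from color_sum.cc above.

// Standalone sketch: packed upper-triangular colour sum vs the full quadratic form.
#include <array>
#include <complex>
#include <cstdio>

constexpr int ncol = 3;
constexpr double colorMatrix[ncol][ncol] = { { 16, -2, 6 }, { -2, 16, 6 }, { 2, 2, 6 } };
constexpr double colorDenom[ncol] = { 3, 3, 1 };

// Pack the normalized matrix N(i,j) = colorMatrix[i][j]/colorDenom[i] into its upper
// triangle, doubling the off-diagonal terms (valid because N is real and symmetric).
constexpr auto packUpperTriangle()
{
  std::array<double, ncol * ( ncol + 1 ) / 2> cf{};
  int k = 0;
  for( int i = 0; i < ncol; i++ )
  {
    cf[k++] = colorMatrix[i][i] / colorDenom[i];
    for( int j = i + 1; j < ncol; j++ ) cf[k++] = 2 * colorMatrix[i][j] / colorDenom[i];
  }
  return cf;
}

// Triangular sum: |M|^2 += sum_i ( ReJ_i * ztempR_i + ImJ_i * ztempI_i ) with j >= i only.
double colourSum( const std::complex<double> jamp[ncol] )
{
  static constexpr auto cf = packUpperTriangle();
  double me2 = 0;
  int k = 0;
  for( int i = 0; i < ncol; i++ )
  {
    double ztempR = 0, ztempI = 0;
    for( int j = i; j < ncol; j++, k++ )
    {
      ztempR += cf[k] * jamp[j].real();
      ztempI += cf[k] * jamp[j].imag();
    }
    me2 += jamp[i].real() * ztempR + jamp[i].imag() * ztempI; // AMA + BMB
  }
  return me2;
}

int main()
{
  const std::complex<double> jamp[ncol] = { { 1.0, 0.5 }, { -0.3, 0.7 }, { 0.2, -0.1 } };
  // Cross-check against the full double loop Re( sum_ij conj(J_i) N(i,j) J_j ).
  double ref = 0;
  for( int i = 0; i < ncol; i++ )
    for( int j = 0; j < ncol; j++ )
      ref += ( std::conj( jamp[i] ) * ( colorMatrix[i][j] / colorDenom[i] ) * jamp[j] ).real();
  printf( "triangular: %f  full: %f\n", colourSum( jamp ), ref );
  return 0;
}

Both numbers agree because the imaginary cross terms cancel for a real symmetric matrix, which is the same observation the generated CPU kernel states in its comments; only the loop bounds and the pre-doubled packed entries change relative to the full NCOLOR x NCOLOR sum.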
@@ -455,10 +442,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -467,6 +456,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/addmothers.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/addmothers.f index 9a31ed201d..d6cded9a2d 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else @@ -220,7 +221,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, ncolmp=0 endif if(mo_color.gt.1.and. - $ mo_color.ne.3.and.mo_color.ne.8)then + $ mo_color.ne.3.and.mo_color.ne.8.and.mo_color.ne.6)then da_color(1)=get_color(jpart(1,ida(1))) da_color(2)=get_color(jpart(1,ida(2))) call write_error(da_color(1), da_color(2), mo_color) @@ -326,8 +327,8 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif endif !end of check on LC -c Just zero helicity info for intermediate states - jpart(7,i) = 0 +c Just No helicity info for intermediate states + jpart(7,i) = 9 enddo ! 
do i 100 continue if (is_LC) call check_pure_internal_flow(icolalt,jpart, maxcolor) @@ -586,13 +587,13 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, i3=i3+1 c color for t-channels needs to be reversed if(i3.eq.1) icol(2,ires)=icolmp(1,i) - if(i3.eq.2) icol(1,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 c color for t-channels needs to be reversed if(i3bar.eq.1) icol(1,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(2,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(2,i) endif enddo @@ -764,6 +765,14 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, endif endif c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) + elseif(mo_color.eq.6.and.i3.eq.0.and.i3bar.eq.2)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue + elseif(mo_color.eq.6.and.i3.eq.2.and.i3bar.eq.0)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue else c Don't know how to deal with this call write_error(i3,i3bar,mo_color) @@ -814,12 +823,12 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(icolmp(1,i).gt.0)then i3=i3+1 if(i3.eq.1) icol(1,ires)=icolmp(1,i) - if(i3.eq.2) icol(2,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 if(i3bar.eq.1) icol(2,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(1,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(2,i) endif enddo @@ -830,23 +839,33 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(n3.le.1.and.n3bar.eq.0) icol(2,ires)=0 if(i3.ne.n3.or.i3bar.ne.n3bar) then - if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.0.and.i3.eq.0)then + if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.i3)then c This is an epsilon index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(1,ires)=maxcolor + if(i3.eq.0) then + maxcolor=maxcolor+1 + icol(1,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(2,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(2,ires)=-maxcolor endif - elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.0.and.i3bar.eq.0)then + elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.i3bar)then c This is an epsilonbar index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(2,ires)=maxcolor + if(i3bar.eq.0)then + maxcolor=maxcolor+1 + icol(2,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(1,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(1,ires)=-maxcolor endif elseif(n3.gt.0.and.n3bar.eq.0.and.i3-i3bar.eq.n3.or. $ n3bar.gt.0.and.n3.eq.0.and.i3bar-i3.eq.n3bar.or. 
@@ -961,6 +980,12 @@ subroutine fix_s_color_indices(n3,n3bar,i3,i3bar,ncolmp,icolmp, if(n3.eq.1) icol(1,ires)=max_n3 if(n3bar.eq.1) icol(2,ires)=min_n3bar endif + do i=ires,-1 + if (icol(1,i).eq.maxcol) icol(1,i)=mincol + if (icol(1,i).eq.-maxcol) icol(1,i)=-mincol + if (icol(2,i).eq.maxcol) icol(2,i)=mincol + if (icol(2,i).eq.-maxcol) icol(2,i)=-mincol + enddo c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) endif else diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cluster.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/color_sum.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int 
nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cuts.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. + return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. 
Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. + +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/genps.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/genps.f index 1c32e93f5d..5449ab9e30 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/genps.f @@ -124,7 +124,8 @@ subroutine gen_mom(iconfig,mincfig,maxcfig,invar,wgt,x,p1) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer fake_id + common/to_sprop/sprop,tprid,fake_id logical firsttime double precision xprop(3,nexternal),tprop(3,nexternal) @@ -1373,6 +1374,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1389,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not. 
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1637,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1657,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1818,8 +1834,8 @@ double precision function get_channel_cut(p, config) common/to_forest/ iforest, tstrategy integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) - integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer tprid(-max_branch:-1,lmaxconfigs), fake_id + common/to_sprop/sprop,tprid,fake_id double precision stot,m1,m2 common/to_stot/stot,m1,m2 @@ -1915,7 +1931,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1946,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile deleted file mode 100644 index 49e6800fff..0000000000 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile +++ /dev/null @@ -1,327 +0,0 @@ -SHELL := /bin/bash - -include ../../Source/make_opts - -# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) -# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing -include ../../src/cudacpp_config.mk -ifeq ($(CUDACPP_BUILDDIR),) -$(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) -endif - -# Disable all Fortran warnings? 
-FFLAGS+= -w - -# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html -FFLAGS+= -cpp - -# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) -CXXFLAGS = -O3 -Wall -Wshadow -Wextra - -# Add -std=c++17 explicitly to avoid build errors on macOS -# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" -ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 -endif - -# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) -ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) - override CXX:=ccache $(CXX) -endif -###ifeq ($(USECCACHE)$(shell echo $(FC) | grep ccache),1) -### override FC:=ccache $(FC) -###endif - -# Load additional dependencies of the bias module, if present -ifeq (,$(wildcard ../bias_dependencies)) -BIASDEPENDENCIES = -else -include ../bias_dependencies -endif - -# Definitions - -LIBDIR = ../../lib/ -BINDIR = ../../bin/ -PROG = madevent - -ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") - include ../MadLoop_makefile_definitions -else - LINK_LOOP_LIBS = - LOOP_LIBS = - LOOP_INCLUDE = - LINK_MADLOOP_LIB = - MADLOOP_LIB = -endif - -LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias - -CUDACPP_MAKEFILE=cudacpp.mk -processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') -ifeq ($(BACKEND),cuda) -CUDACPP_COMMONLIB=mg5amc_common_cuda -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda -else ifeq ($(BACKEND),hip) -CUDACPP_COMMONLIB=mg5amc_common_hip -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip -else -CUDACPP_COMMONLIB=mg5amc_common_cpp -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp -endif - -LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) - -ifneq ("$(wildcard ../../Source/RUNNING)","") - LINKLIBS += -lrunning - LIBS += $(LIBDIR)librunning.$(libext) -endif - - -# Source files - -MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) -MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) -ifeq ($(strip $(MATRIX_HEL)),) - MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) -endif - - -PROCESS= myamp.o genps.o unwgt.o setcuts.o get_color.o \ - cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ - idenparts.o dummy_fct.o - -DSIG=driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) -DSIG_cudacpp=driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) - -SYMMETRY = symmetry.o idenparts.o - -# Binaries - -ifeq ($(UNAME),Darwin) -LDFLAGS += -lc++ # avoid 'Undefined symbols' for chrono::steady_clock on macOS (checked with otool -L libmg5amc_gg_ttx_cpp.so) -LDFLAGS += -mmacosx-version-min=11.3 # avoid "ld: warning: object file was built for newer macOS version than being linked" -else -LDFLAGS += -Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 (not supported on macOS) -endif - -# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) -.DEFAULT_GOAL := all - -ifeq ($(BACKEND),cuda) -all: $(PROG)_fortran 
$(CUDACPP_BUILDDIR)/$(PROG)_cuda -else ifeq ($(BACKEND),hip) -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip -else -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp -endif - -# Disable OpenMP by default: enable OpenMP only if USEOPENMP=1 (#758) -ifeq ($(USEOPENMP),1) -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -override OMPFLAGS = -fopenmp -LINKLIBS += -liomp5 # see #578 -LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -override OMPFLAGS = -fopenmp -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 -else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang -else -override OMPFLAGS = -fopenmp -endif -endif - -$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o - $(FC) -o $(PROG)_fortran $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) - -$(LIBS): .libs - -.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat - cd ../../Source; make - touch $@ - -$(CUDACPP_BUILDDIR)/.cudacpplibs: - $(MAKE) -f $(CUDACPP_MAKEFILE) - touch $@ - -# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH -# Use relative paths with respect to the executables ($ORIGIN on Linux) -# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary -ifeq ($(UNAME_S),Darwin) - override LIBFLAGSRPATH = -else ifeq ($(USEBUILDDIR),1) - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' -else - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' -endif - -.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link - -madevent_fortran_link: $(PROG)_fortran - rm -f $(PROG) - ln -s $(PROG)_fortran $(PROG) - -madevent_cuda_link: - $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) - -madevent_hip_link: - $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) - -madevent_cpp_link: - $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -override SUPPORTED_AVXS = cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto -madevent_%_link: - @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then echo "ERROR! 
Invalid target '$@' (supported madevent_cpp*_link targets are: $(foreach avx,$(SUPPORTED_AVXS),'madevent_cpp$(avx)_link'))"; exit 1; fi - $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_cuda now uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_hip also uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -counters.o: counters.cc timer.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -ompnumthreads.o: ompnumthreads.cc ompnumthreads.h - $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ - -$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) - $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) - -gensym: $(SYMMETRY) configs.inc $(LIBS) - $(FC) -o gensym $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) - -###ifeq (,$(wildcard fbridge.inc)) # Pointless: fbridge.inc always exists as this is the cudacpp-modified makefile! -###$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat -### cd ../../Source/MODEL; make -### -###$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat -### cd ../../Source; make -### -###$(LIBDIR)libpdf.$(libext): -### cd ../../Source/PDF; make -### -###$(LIBDIR)libgammaUPC.$(libext): -### cd ../../Source/PDF/gammaUPC; make -###endif - -# Add source so that the compiler finds the DiscreteSampler module. 
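(Illustration, not part of the patch.) The backend switch in the makefile being deleted above, which this patch moves out of the per-process makefile (presumably into the new cudacpp overlay include), derives a pair of cudacpp library names from BACKEND and the P* subprocess directory name. A Python sketch of that selection, with hypothetical names and an example directory:

```python
import os

def cudacpp_libs(backend, subproc_dir):
    """Sketch of the make logic: common and per-process cudacpp library names."""
    # keep the last two '_'-separated fields of the directory name,
    # mirroring the awk one-liner that defines processid_short
    parts = os.path.basename(subproc_dir).split('_')
    processid_short = '_'.join(parts[-2:])
    suffix = backend if backend in ('cuda', 'hip') else 'cpp'
    return f'mg5amc_common_{suffix}', f'mg5amc_{processid_short}_{suffix}'

# e.g. cudacpp_libs('cuda', 'P1_gg_bbx') -> ('mg5amc_common_cuda', 'mg5amc_gg_bbx_cuda')
```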
-$(MATRIX): %.o: %.f - $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%.o: %.f - $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%_cudacpp.o: %.f - $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ - -# Dependencies - -driver.f: genps.inc -symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc -genps.o: genps.inc nexternal.inc configs.inc -dummy_fct.0: run.inc genps.inc -cuts.o: genps.inc nexternal.inc pmass.inc -setcuts.o: genps.inc run_config.inc -invarients.o: genps.inc nexternal.inc -myamp.o: props.inc genps.inc nexternal.inc -reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ - run_config.inc -cluster.o: cluster.inc genps.inc nexternal.inc message.inc -addmothers.o: genps.inc nexternal.inc symswap.inc message.inc -unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ - run_config.inc -initcluster.o: message.inc - -# Extra dependencies on discretesampler.mod - -auto_dsig.o: .libs -driver.o: .libs -driver_cudacpp.o: .libs -$(MATRIX): .libs -genps.o: .libs - -# Cudacpp bldall targets - -ifeq ($(UNAME_P),ppc64le) -bldavxs: bldnone bldsse4 -else ifeq ($(UNAME_P),arm) -bldavxs: bldnone bldsse4 -else -bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z -endif - -ifneq ($(shell which hipcc 2>/dev/null),) -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldhip bldcuda bldavxs -else -bldall: bldhip bldavxs -endif -else -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldcuda bldavxs -else -bldall: bldavxs -endif -endif - -bldcuda: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cuda - -bldhip: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=hip - -bldnone: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppnone - -bldsse4: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 - -bldavx2: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 - -bld512y: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y - -bld512z: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z - -# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) - -clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn - $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(CUDACPP_BUILDDIR)/$(PROG)_hip - -cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src - $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall - rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs - rm -f .libs - -cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src - make -C ../../Source cleanall - rm -rf $(LIBDIR)libbias.$(libext) - rm -f ../../Source/*.mod ../../Source/*/*.mod - -distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation - $(MAKE) -f $(CUDACPP_MAKEFILE) distclean diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile new file mode 120000 index 0000000000..9fba275947 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile @@ -0,0 +1 @@ +makefile_wrapper.mk \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile_original.mk 
b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile_original.mk new file mode 100644 index 0000000000..348c283be7 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile_original.mk @@ -0,0 +1,101 @@ +include ../../Source/make_opts +FFLAGS+= -w + +# Load additional dependencies of the bias module, if present +ifeq (,$(wildcard ../bias_dependencies)) +BIASDEPENDENCIES = +else +include ../bias_dependencies +endif + +# Definitions + +LIBDIR = ../../lib/ +BINDIR = ../../bin/ +PROG = madevent + +ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") + include ../MadLoop_makefile_definitions +else + LINK_LOOP_LIBS = + LOOP_LIBS = + LOOP_INCLUDE = + LINK_MADLOOP_LIB = + MADLOOP_LIB = +endif + +LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L../../lib/ -ldhelas -ldsample -lmodel -lgeneric -lpdf -lgammaUPC -lcernlib $(llhapdf) -lbias + +LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) + +ifneq ("$(wildcard ../../Source/RUNNING)","") + LINKLIBS += -lrunning + LIBS += $(LIBDIR)librunning.$(libext) +endif + + +# Source files + +MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) +MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) +ifeq ($(strip $(MATRIX_HEL)),) + MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) +endif + + +PROCESS= driver.o myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o \ + $(patsubst %.f,%.o,$(wildcard auto_dsig*.f)) \ + +SYMMETRY = symmetry.o idenparts.o + +# Binaries + +$(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) + $(FC) -o $(PROG) $(PROCESS) $(MATRIX) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) + +$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat + cd ../../Source/MODEL; make + +$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat + cd ../../Source; make + +$(LIBDIR)libpdf.$(libext): + cd ../../Source/PDF; make + +$(LIBDIR)libgammaUPC.$(libext): + cd ../../Source/PDF/gammaUPC; make + +# Add source so that the compiler finds the DiscreteSampler module. 
+$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +# Dependencies + +driver.f: genps.inc +symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc +genps.o: genps.inc nexternal.inc configs.inc +dummy_fct.0: run.inc genps.inc +cuts.o: genps.inc nexternal.inc pmass.inc +setcuts.o: genps.inc run_config.inc +invarients.o: genps.inc nexternal.inc +myamp.o: props.inc genps.inc nexternal.inc +reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ + run_config.inc +cluster.o: cluster.inc genps.inc nexternal.inc message.inc +addmothers.o: genps.inc nexternal.inc symswap.inc message.inc +unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ + run_config.inc +initcluster.o: message.inc + +clean: + $(RM) *.o gensym madevent madevent_forhel diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/myamp.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min with dsqrt_shatmax**2 with the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/reweight.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/reweight.f index 0a0bafa7c1..9d8fe1c4f0 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/reweight.f @@ -976,9 +976,9 @@ logical function setclscales(p, keepq2bck, ivec) $ ' and jcentral is ',jcentral(1),jcentral(2) if (btest(mlevel,3)) then - write(*,'(a$)') 'QCD jets (final): ' + write(*,'(a,$)') 'QCD jets (final): ' do i=3,nexternal - if(iqjets(i).gt.0) write(*,'(i3$)') i + if(iqjets(i).gt.0) write(*,'(i3,$)') i enddo write(*,*) endif @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) @@ -1387,7 +1387,9 @@ double precision function rewgt(p, ivec) integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - include 'configs.inc' + integer fake_id + common/to_sprop/sprop,tprid,fake_id +c include 'configs.inc' real*8 xptj,xptb,xpta,xptl,xmtc real*8 xetamin,xqcut,deltaeta common /to_specxpt/xptj,xptb,xpta,xptl,xmtc,xetamin,xqcut,deltaeta @@ -1588,6 +1590,8 @@ double precision function rewgt(p, ivec) $ ipdgcl(1,igraphs(1),iproc),ipart,.false.).and. $ (goodjet(idacl(n,1)).or.goodjet(idacl(n,2)))) then c alpha_s weight + + if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1600,6 +1604,7 @@ double precision function rewgt(p, ivec) write(*,*)' as: ',alphas(alpsfact*dsqrt(q2now)), & '/',asref,' -> ',alphas(alpsfact*dsqrt(q2now))/asref write(*,*)' and G=',SQRT(4d0*PI*ALPHAS(scale)) + endif endif endif endif diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/runTest.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/symmetry.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/symmetry.f index 309540a0a2..d0706e90b4 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/symmetry.f @@ -51,6 +51,7 @@ program symmetry integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) + integer fake_id include 'configs.inc' data use_config/0,lmaxconfigs*0/ @@ -232,7 +233,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +243,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +261,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/unwgt.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/banner.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/banner.py index 42d82818d0..2bc6174b85 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/banner.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3306,7 +3353,7 @@ def edit_dummy_fct_from_file(self, filelist, outdir): def retro_compatible_custom_fct(lines, mode=None): f77_type = ['real*8', 'integer', 'double precision', 'logical'] - function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ + function_pat = re.compile(r'^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ % {'type':'|'.join(f77_type)}, re.I+re.M) include_pat = re.compile(r"\s+include\s+[\'\"]([\w\./]*)") @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - 
logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/check_param_card.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
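(Illustration, not part of the patch.) The new `flavour_bias` run-card option added in banner.py above is a pair [abs(PDG), enhancement factor]; the check_validity rule requires exactly two entries, rejects negative values, and forces event_norm to 'bias' whenever the enhancement is non-trivial, since the enhanced events are written with correspondingly reduced weights. A standalone Python sketch of that rule:

```python
def check_flavour_bias(flavour_bias, event_norm):
    """Sketch of the run_card consistency check for flavour_bias."""
    if len(flavour_bias) != 2:
        raise ValueError("'flavour_bias' must be [abs(PDG), enhancement factor]")
    if any(x < 0 for x in flavour_bias):
        raise ValueError("flavour and enhancement factor must be positive")
    pdg, factor = flavour_bias
    if factor != 1 and event_norm != 'bias':
        # events are reweighted by 1/factor, so the normalisation must be 'bias'
        event_norm = 'bias'
    return event_norm

# Example from the run-card comment: enhance b-quark events by a factor 100
assert check_flavour_bias([5, 100], 'average') == 'bias'
```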
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/common_run_interface.py index 9ff7390cf5..8de498fcc2 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. 
Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. (beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. 
Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" @@ -6868,8 +6890,9 @@ def handle_alarm(signum, frame): else: log_level=20 - - if run_card: + if run_card and (run_card['lpp1'] !=0 or run_card['lpp2'] !=0): + # They are likely case like lpp=+-3, where alpas not need reset + # but those have dedicated name of pdf avoid the reset as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, 'cteq6_l': 0.118, diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/file_writers.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/files.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/files.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: 
raise if log: - logger.warning(why) + logger.warning("fail to cp", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "
To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "
To save bandwidth not all diagrams were converted to PNG."; print PAGE "
To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 
'nprocs' in opts: + self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/histograms.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/histograms.py index 51ae2914fc..0883cd9613 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the 
variation. + if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2405,10 +2404,10 @@ def output(self, path, format='gnuplot',number_of_ratios = -1, gnuplot_output_list=gnuplot_output_list_v5 else: output, _ = p.communicate() - output.decode(errors='ignore') + output = output.decode(errors='ignore') if not output: gnuplot_output_list=gnuplot_output_list_v5 - elif float(output.split()[1]) < 5. : + elif int(output.split()[1].split('.')[0]) < 5 : gnuplot_output_list=gnuplot_output_list_v4 else: gnuplot_output_list=gnuplot_output_list_v5 @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. 
Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. 
import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/lhe_parser.py index f6e47956cd..d4b94bab10 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -1792,7 +1794,10 @@ def add_decays(self, pdg_to_decay): if particle.pdg in pdg_to_decay and pdg_to_decay[particle.pdg]: one_decay = pdg_to_decay[particle.pdg].pop() self.add_decay_to_particle(i, one_decay) + particle.helicity = 9 return 
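Aside on the define_init_banner hunk above: casting the process-group identifier to int before using it as a dictionary key avoids the same group appearing under both its string form (read from the banner) and its integer form, which would silently split one cross section into two entries. A toy illustration, with made-up numbers:

grouped_cross = {}
for group, cross in [(1, 0.5), ('1', 0.25)]:    # same group, once as int and once as str
    grouped_cross[group] = grouped_cross.get(group, 0.0) + cross
print(grouped_cross)                             # {1: 0.5, '1': 0.25} -- two entries, wrong total

grouped_cross = {}
for group, cross in [(1, 0.5), ('1', 0.25)]:
    grouped_cross[int(group)] = grouped_cross.get(int(group), 0.0) + cross
print(grouped_cross)                             # {1: 0.75} -- single entry, as intended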
self.add_decays(pdg_to_decay) + + return self @@ -2166,10 +2171,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2961,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..dea35930ea 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz 
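Aside on the pseudorapidity fix above: the conventional definition is eta = 0.5*ln((|p| + pz)/(|p| - pz)), equivalently -ln(tan(theta/2)), and the old expression returned -eta. A quick numerical cross-check with illustrative momentum components (not part of the patch):

import math

def pseudorapidity(px, py, pz):
    norm = math.sqrt(px**2 + py**2 + pz**2)
    return 0.5 * math.log((norm + pz) / (norm - pz))   # corrected sign, as in the hunk above

px, py, pz = 0.3, 0.4, 1.2
theta = math.acos(pz / math.sqrt(px**2 + py**2 + pz**2))
assert abs(pseudorapidity(px, py, pz) - (-math.log(math.tan(theta / 2)))) < 1e-12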
%(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3667,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_comine_iteration(self, line): + def do_combine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -5596,6 +5635,19 @@ def do_plot(self, line): else: logger.info('No valid files for delphes plot') + def do_compile(self, line): + """compile the current directory """ + + args = self.split_arg(line) + self.ask_run_configuration(mode='parton') + self.run_card = banner_mod.RunCard(pjoin(self.me_dir, 'Cards', 'run_card.dat')) + self.configure_directory(html_opening =False) + + for Pdir in self.get_Pdir(): + misc.sprint(Pdir) + self.compile(['gensym'], cwd=Pdir) + self.compile(['madevent_forhel'], cwd=Pdir) + ############################################################################ def do_syscalc(self, line): """Evaluate systematics variation weights for a given run""" @@ -6132,7 +6184,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in 
Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file.' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6896,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6906,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7023,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... 
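Aside on the new remove_empty_events helper above: it inspects the tail of each channel log to explain why events.lhe is missing or empty, bucketing directories by the first known marker found. A simplified sketch of that classification idea; the marker strings are taken from the hunk, while the file handling here is deliberately naive compared with the backwards reader (misc.BackRead) used in the patch, and the 200-line tail is an arbitrary example value.

MARKERS = [
    ('bwconfig', 'Impossible BW configuration'),
    ('cuts',     'Loosen cuts or increase max_events'),
    ('zero',     'all returned zero'),
]

def classify_empty_channel(log_path):
    try:
        with open(log_path) as log:
            tail = log.readlines()[-200:]          # only the end of the log matters here
    except OSError:
        return 'not found'
    for line in reversed(tail):                    # scan backwards, like the real helper
        for reason, marker in MARKERS:
            if marker in line:
                return reason
    return 'unknown'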
self.exec_cmd('combine_events') @@ -6902,6 +7051,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7066,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7078,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7203,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7223,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/restore_data b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/restore_data +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . 
-mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/sum_html.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s
' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/madevent b/epochX/cudacpp/heft_gg_bb.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/madevent +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/heft_gg_bb.mad/src/HelAmps_heft.h b/epochX/cudacpp/heft_gg_bb.mad/src/HelAmps_heft.h index 1b04401547..534bb65c13 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/src/HelAmps_heft.h +++ b/epochX/cudacpp/heft_gg_bb.mad/src/HelAmps_heft.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/heft_gg_bb.mad/src/Parameters_heft.cc b/epochX/cudacpp/heft_gg_bb.mad/src/Parameters_heft.cc index 0fa5a34cf0..3b4c719337 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/src/Parameters_heft.cc +++ b/epochX/cudacpp/heft_gg_bb.mad/src/Parameters_heft.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/heft_gg_bb.mad/src/Parameters_heft.h b/epochX/cudacpp/heft_gg_bb.mad/src/Parameters_heft.h index 0faa7bb71e..7ab2db5300 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/src/Parameters_heft.h +++ b/epochX/cudacpp/heft_gg_bb.mad/src/Parameters_heft.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuConfig.h index 7c6a082392..be5c5a6357 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/heft_gg_bb.mad/test/cudacpp_test.mk b/epochX/cudacpp/heft_gg_bb.mad/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/heft_gg_bb.mad/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt index 04039fcd14..c8cdee7d2a 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt +++ b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,17 +46,16 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model heft INFO: Restrict model heft with file models/heft/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  @@ -123,49 +122,49 @@ Defined multiparticle all = g u c d s u~ c~ d~ s~ a ve vm vt e- mu- ve~ vm~ vt~ generate g g > b b~ HIW<=1 INFO: Trying process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Process has 4 diagrams -1 processes with 4 diagrams generated in 0.006 s +1 processes with 4 diagrams generated in 0.008 s Total: 1 processes with 4 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_heft_gg_bb Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/. 
-Generated helas calls for 1 subprocesses (4 diagrams) in 0.008 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/. +Generated helas calls for 1 subprocesses (4 diagrams) in 0.010 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.261 s +ALOHA: aloha creates 4 routines in 0.214 s VVS3 VVV1 FFV1 FFV1 FFV1 FFS2 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./HelAmps_heft.h -INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./HelAmps_heft.h +INFO: Created file HelAmps_heft.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.cc INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. quit -real 0m0.646s -user 0m0.583s -sys 0m0.051s -Code generation completed in 1 seconds +real 0m0.656s +user 0m0.586s +sys 0m0.063s +Code generation completed in 0 seconds diff --git a/epochX/cudacpp/heft_gg_bb.sa/COPYRIGHT b/epochX/cudacpp/heft_gg_bb.sa/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/COPYRIGHT +++ b/epochX/cudacpp/heft_gg_bb.sa/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). 
+ */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
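//--------------------------------------------------------------------------
// Illustrative usage sketch (assuming the GpuAbstraction.h and GpuRuntime.h
// headers above, in a GPU build where MGONGPU_HAS_NO_BLAS is not defined):
// every cuBLAS/hipBLAS call returns a gpuBlasStatus_t and is wrapped in
// checkGpuBlas, in the same way that checkGpu wraps runtime API calls.
// The function name exampleBlasSetup is hypothetical.
#include "GpuAbstraction.h"
#include "GpuRuntime.h"
inline void exampleBlasSetup()
{
  gpuStream_t stream;
  gpuStreamCreate( &stream );                         // checkGpu is built into the gpuStreamCreate macro
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) );           // aborts (via assert) on a non-SUCCESS status
  checkGpuBlas( gpuBlasSetStream( handle, stream ) ); // attach the handle to a specific stream
  checkGpuBlas( gpuBlasDestroy( handle ) );
  gpuStreamDestroy( stream );
}
//--------------------------------------------------------------------------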
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
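//--------------------------------------------------------------------------
// Illustrative sketch (hypothetical helper, not in the generated code) of how
// the per-helicity "super-buffer" sizes allocated in computeGoodHelicities
// above scale: each good helicity gets its own slice of nevt MEs, of
// ncolor*2 jamp elements per event, and (when BLAS color sums are enabled)
// of temporary workspace. With e.g. nGoodHel=16, nevt=524288 and ncolor=3
// this gives about 8.4M ME elements and 50.3M jamp elements.
#include <cstddef>
struct HelSuperBufferSizes
{
  std::size_t helMEs;    // nGoodHel * nevt                    (m_pHelMEs)
  std::size_t helJamps;  // nGoodHel * ncolor * 2 * nevt       (m_pHelJamps, 2 = real/imag in this sketch)
  std::size_t helNumDen; // nGoodHel * nevt for numerators plus the same for denominators
  std::size_t helBlasTmp; // fptype2 workspace for BLAS color sums (0 if BLAS is disabled)
};
inline HelSuperBufferSizes
helSuperBufferSizes( std::size_t nGoodHel, std::size_t nevt, std::size_t ncolor, bool blas, bool mixedFp )
{
  constexpr std::size_t nx2 = 2; // real and imaginary parts
  HelSuperBufferSizes s{};
  s.helMEs = nGoodHel * nevt;
  s.helJamps = nGoodHel * ncolor * nx2 * nevt;
  s.helNumDen = 2 * nGoodHel * nevt;
  if( blas )
    s.helBlasTmp = mixedFp
                     ? nGoodHel * ( 2 * ncolor * nx2 + 1 ) * nevt // mixed double/float mode
                     : nGoodHel * ncolor * nx2 * nevt;            // plain double or float mode
  return s;
}
//--------------------------------------------------------------------------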
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryBuffers.h index 90075da66e..7d7b960511 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_heft_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 
+201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef 
DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc index b9f394434a..e9ac65dc13 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_heft.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_heft_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_heft_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 3; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
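For reference, a minimal sketch (not part of the patch) of the flat layout assumed for the per-helicity jamp buffers described in the calculate_jamps signature above: one super-buffer holds real/imaginary parts (ix2), colors (icol), good helicities (ighel) and events (ievt), with the event index fastest so that consecutive GPU threads touch consecutive memory. The helper name toyJampIndex is illustrative only; the generated code goes through DeviceAccessJamp or the "ghelAllJamps + ighel * nevt" per-helicity view, but the strides match those written out explicitly in convertD2F_Jamps later in this patch.

#include <cstddef>
// Illustrative flat index into the jamp super-buffer: [ix2][icol][ighel][ievt]
constexpr std::size_t
toyJampIndex( int ix2, int icol, int ighel, int ievt, int ncolor, int nhel, int nevt )
{
  return static_cast<std::size_t>( ix2 ) * ncolor * nhel * nevt // slowest: real (0) or imaginary (1) part
         + static_cast<std::size_t>( icol ) * nhel * nevt       // then the color index
         + static_cast<std::size_t>( ighel ) * nevt             // then the good-helicity index
         + static_cast<std::size_t>( ievt );                    // fastest: the event index (coalesced device access)
}
// Example: ncolor=3, nhel=2, nevt=8 -> imaginary part of color 1, helicity 0, event 5
static_assert( toyJampIndex( 1, 1, 0, 5, 3, 2, 8 ) == 69, "jamp super-buffer layout sanity check" );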
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -377,155 +433,43 @@ namespace mg5amcCpu jamp_sv[1] -= amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_bbx()?) 
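As an aside on the "Store the leading color flows for choice of color" step just above: in the new GPU flow one calculate_jamps kernel is launched per good helicity on its own stream, so several kernels may update the same colAllJamp2s slot for a given event concurrently; this is why the device path accumulates |jamp(icol)|^2 with atomicAdd, while the C++ path can keep a plain +=. Below is a minimal CUDA sketch of that accumulation pattern (toyAccumulateJamp2 and its buffers are illustrative names, not the generated code).

// Toy accumulation of |jamp|^2 into a shared [ncolor][nevt] buffer, one kernel
// launch per helicity on its own stream (double atomicAdd needs sm_60 or later).
__global__ void
toyAccumulateJamp2( double* jamp2,        // in/out: [ncolor * nevt], running sum over colors and helicities
                    const double* jampRe, // input: [ncolor * nevt], real parts for one helicity
                    const double* jampIm, // input: [ncolor * nevt], imaginary parts for one helicity
                    const int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    const double re = jampRe[icol * nevt + ievt];
    const double im = jampIm[icol * nevt + ievt];
    atomicAdd( &jamp2[icol * nevt + ievt], re * re + im * im ); // safe across concurrent helicity streams
  }
}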
- - // The color denominators (initialize all array elements, with ncolor=3) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3, 1 }; // 1-D array[3] - - // The color matrix (initialize all array elements, with ncolor=3) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2, 6 }, - { -2, 16, 6 }, - { 2, 2, 6 } }; // 2-D array[3][3] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
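For clarity, a small standalone sketch of what the color-matrix block removed here computes for a single event (and what the new color_sum.cc below reimplements): the |M|^2 contribution of one helicity is the quadratic form sum over i,j of Re(J_i) (cf_ij/denom_i) Re(J_j) plus the same term for Im(J), where cf_ij/denom_i is symmetric even though cf itself is not. The function name toyColorSum is illustrative; the ncolor=3 constants are those of this gg -> bbx process as given in the diff.

// Illustrative single-event color sum (double precision, full symmetric form,
// i.e. the "old" CUDA-style implementation): returns the |M|^2 contribution of
// one helicity from its ncolor=3 partial amplitudes.
double
toyColorSum( const double jampRe[3], const double jampIm[3] )
{
  constexpr int ncolor = 3;
  constexpr double denom[ncolor] = { 3, 3, 1 };
  constexpr double cf[ncolor][ncolor] = { { 16, -2, 6 }, { -2, 16, 6 }, { 2, 2, 6 } };
  double deltaME = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    double ztempR = 0;
    double ztempI = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ )
    {
      ztempR += cf[icol][jcol] * jampRe[jcol]; // Re(J)^T cf
      ztempI += cf[icol][jcol] * jampIm[jcol]; // Im(J)^T cf
    }
    deltaME += ( ztempR * jampRe[icol] + ztempI * jampIm[icol] ) / denom[icol];
  }
  return deltaME; // the real code ADDs this to the running sum of |M|^2 over helicities
}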
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -565,7 +509,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -598,6 +546,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MB ); m_masses.push_back( m_pars->mdl_MB ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MB, (fptype)m_pars->mdl_MH, (fptype)m_pars->mdl_WH }; @@ -639,6 +591,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_heft::ZERO ); m_masses.push_back( Parameters_heft::mdl_MB ); m_masses.push_back( Parameters_heft::mdl_MB ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -759,8 +715,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -768,25 +724,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // 
Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -931,13 +1065,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -949,18 +1077,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -985,93 +1118,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1113,7 +1183,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1136,7 +1206,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1145,21 +1215,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1173,8 +1245,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1190,11 +1264,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1296,14 +1371,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h index 30c5663297..cacb35c052 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_heft.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 4; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 3; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running 
sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/color_sum.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/color_sum.cc new file mode 100644 index 0000000000..94b1137d64 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/color_sum.cc @@ -0,0 +1,428 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
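A small compile-time illustration (not part of the generated file) of the "normalized color matrix" built in this file: each row icol of colorMatrix is divided by colorDenom[icol]. With the ncolor=3 constants defined below (colorDenom = {3,3,1}, colorMatrix = {{16,-2,6},{-2,16,6},{2,2,6}}), the normalized matrix is {{16/3,-2/3,2},{-2/3,16/3,2},{2,2,6}}, which is symmetric even though colorMatrix itself is not; this symmetry is what the triangular C++ path and the BLAS path exploit. The toy* names are illustrative local copies.

// Illustrative local copies of the constants defined further down in this file
constexpr double toyDenom[3] = { 3, 3, 1 };
constexpr double toyMatrix[3][3] = { { 16, -2, 6 }, { -2, 16, 6 }, { 2, 2, 6 } };
constexpr double toyNorm( int i, int j ) { return toyMatrix[i][j] / toyDenom[i]; }
// The normalization makes the matrix symmetric: e.g. 6/3 == 2/1 == 2
static_assert( toyNorm( 0, 2 ) == 2 && toyNorm( 2, 0 ) == 2, "off-diagonal symmetry (0,2)" );
static_assert( toyNorm( 1, 2 ) == 2 && toyNorm( 2, 1 ) == 2, "off-diagonal symmetry (1,2)" );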
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=3) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3, 1 }; // 1-D array[3] + + // The color matrix (initialize all array elements, with ncolor=3) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2, 6 }, + { -2, 16, 6 }, + { 2, 2, 6 } }; // 2-D array[3][3] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt)
+                                              nevtN ) );                       // there are nevtN (nhel*nevt) "batches"
+
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    // Convert MEs from float to double
+    for( int ighel = 0; ighel < nhel; ighel++ )
+    {
+      fptype* hAllMEs = ghelAllMEs + ighel * nevt;          // MEs for a single helicity ihel
+      fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel
+      gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 );
+    }
+#endif
+  }
+#endif /* clang-format on */
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_gpu( fptype* ghelAllMEs,           // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
+                 fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+                 const int nGoodHel,           // input: number of good helicities
+                 const int gpublocks,          // input: cuda gpublocks
+                 const int gputhreads )        // input: cuda gputhreads
+  {
+    const int nevt = gpublocks * gputhreads;
+    // CASE 1: KERNEL
+    if( !pBlasHandle )
+    {
+      assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+      // Loop over helicities
+      for( int ighel = 0; ighel < nGoodHel; ighel++ )
+      {
+        fptype* hAllMEs = ghelAllMEs + ighel * nevt;           // MEs for one specific helicity ighel
+        const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel
+        gpuStream_t hStream = ghelStreams[ighel];
+        gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel );
+      }
+    }
+    // CASE 2: BLAS
+    else
+    {
+#ifdef MGONGPU_HAS_NO_BLAS
+      assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas
+#else
+      checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed
+      // Reset the tmp buffer
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) );
+#else
+      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) );
+#endif
+      // Delegate the color sum for all good helicities to BLAS
+      color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads );
+#endif
+    }
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+} // end namespace
diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/color_sum.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/color_sum.h
new file mode 120000
index 0000000000..24b0157011
--- /dev/null
+++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/color_sum.h
@@ -0,0 +1 @@
+../color_sum.h
\ No newline at end of file
diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/cudacpp_overlay.mk b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/cudacpp_overlay.mk
new file mode 120000
index 0000000000..181212c4c6
--- /dev/null
+++ 
b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/fbridge.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/makefile_original.mk b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/color_sum.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + 
ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
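A minimal standalone sketch (with hypothetical helper names, not part of the patch itself) of the "all helicities" jamp layout used by DeviceAccessJamp in color_sum.h above: the real parts of all jamps form one ncolor x (nhel*nevt) block and the imaginary parts a second identical block, the same real/imaginary split that color_sum_blas feeds to the gemm calls.

#include <cstdio>

// flat offsets into a jamp buffer of size 2 * ncolor * nhel * nevt (block 0 = real, block 1 = imaginary)
constexpr int jampRealIndex( int icol, int ihel, int ievt, int ncolor, int nhel, int nevt )
{
  return 0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt;
}

constexpr int jampImagIndex( int icol, int ihel, int ievt, int ncolor, int nhel, int nevt )
{
  return 1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt;
}

int main()
{
  constexpr int ncolor = 3, nhel = 4, nevt = 8; // toy sizes (ncolor=3 as in this process)
  static_assert( jampRealIndex( 0, 0, 0, ncolor, nhel, nevt ) == 0, "real block starts at offset 0" );
  static_assert( jampImagIndex( 0, 0, 0, ncolor, nhel, nevt ) == ncolor * nhel * nevt, "imaginary block follows the real block" );
  printf( "jamp(icol=2,ihel=1,ievt=5): real part at %d, imaginary part at %d\n",
          jampRealIndex( 2, 1, 5, ncolor, nhel, nevt ),
          jampImagIndex( 2, 1, 5, ncolor, nhel, nevt ) );
  return 0;
}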
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+      override HASBLAS = hasNoBlas
+    else
+      override HASBLAS = hasBlas
+    endif
+  else
+    override HASBLAS = hasNoBlas
+  endif
+endif
+
+#-------------------------------------------------------------------------------
+
 #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD
 
 # Set the build flags appropriate to OMPFLAGS
@@ -597,6 +627,30 @@ endif
 #$(info RNDCXXFLAGS=$(RNDCXXFLAGS))
 #$(info RNDLIBFLAGS=$(RNDLIBFLAGS))
 
+#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS
+
+$(info HASBLAS=$(HASBLAS))
+override BLASCXXFLAGS=
+override BLASLIBFLAGS=
+
+# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas")
+ifeq ($(HASBLAS),hasNoBlas)
+  override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS
+else ifeq ($(HASBLAS),hasBlas)
+  ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas
+  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+    override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas
+  endif
+else
+  $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported)
+endif
+CXXFLAGS += $(BLASCXXFLAGS)
+GPUFLAGS += $(BLASCXXFLAGS)
+
+#$(info BLASCXXFLAGS=$(BLASCXXFLAGS))
+#$(info BLASLIBFLAGS=$(BLASLIBFLAGS))
+
 #-------------------------------------------------------------------------------
 
 #=== Configure Position-Independent Code
@@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
 MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX)
-gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
+gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
 gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
 endif
 
@@ -799,7 +853,7 @@ ifneq ($(GPUCC),)
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib)
-	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
+	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS)
 # Bypass std::filesystem completely to ease portability on LUMI #803
 #ifneq ($(findstring hipcc,$(GPUCC)),)
 #  $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs
@@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. 
+ +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/runTest.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/heft_gg_bb.sa/src/HelAmps_heft.h b/epochX/cudacpp/heft_gg_bb.sa/src/HelAmps_heft.h index 1b04401547..534bb65c13 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/src/HelAmps_heft.h +++ b/epochX/cudacpp/heft_gg_bb.sa/src/HelAmps_heft.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/heft_gg_bb.sa/src/Parameters_heft.cc b/epochX/cudacpp/heft_gg_bb.sa/src/Parameters_heft.cc index 0fa5a34cf0..3b4c719337 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/src/Parameters_heft.cc +++ b/epochX/cudacpp/heft_gg_bb.sa/src/Parameters_heft.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/heft_gg_bb.sa/src/Parameters_heft.h b/epochX/cudacpp/heft_gg_bb.sa/src/Parameters_heft.h
index 0faa7bb71e..7ab2db5300 100644
--- a/epochX/cudacpp/heft_gg_bb.sa/src/Parameters_heft.h
+++ b/epochX/cudacpp/heft_gg_bb.sa/src/Parameters_heft.h
@@ -7,7 +7,7 @@
 // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h
index d3c4ca5695..7d34de72f8 100644
--- a/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h
@@ -74,6 +74,7 @@
 #define MGONGPU_FPTYPE2_DOUBLE 1 // default
 //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster
 #endif
+
 // Choose whether to inline all HelAmps functions
 // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229)
 // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS
@@ -108,10 +109,23 @@
 #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float)
 #endif
 
+// Choose if cuBLAS and hipBLAS are supported (they are used for the BLAS color sum)
+// For both CUDA and HIP, by default, assume that BLAS is available (i.e. do not define MGONGPU_HAS_NO_BLAS), but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS
+// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?)
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+//#undef MGONGPU_HAS_NO_BLAS // default
+////#define MGONGPU_HAS_NO_BLAS 1
+#elif defined __HIPCC__
+//#undef MGONGPU_HAS_NO_BLAS // default
+////#define MGONGPU_HAS_NO_BLAS 1
+#else
+#define MGONGPU_HAS_NO_BLAS 1
+#endif
+
 // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
 #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
 #undef MGONGPU_NSIGHT_DEBUG // default in CUDA
-//#define MGONGPU_NSIGHT_DEBUG 1
+//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED!
 #else
 #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++
 #endif /* clang-format on */
@@ -232,19 +246,19 @@ using mgOnGpu::fptype2;
 #endif /* clang-format off */
 
-// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!]
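A minimal sketch (an assumed helper, not part of mgOnGpuConfig.h) of how downstream code can branch on the MGONGPU_HAS_NO_BLAS default chosen above: CUDA and HIP builds leave the macro undefined unless -DMGONGPU_HAS_NO_BLAS (or HASBLAS=hasNoBlas) is passed, while C++-only builds always define it.

#include "mgOnGpuConfig.h" // assumes the plugin src/ directory is on the include path
#include <cstdio>

// true if this build can use cuBLAS/hipBLAS for the color sum, false otherwise
constexpr bool buildHasBlas()
{
#ifdef MGONGPU_HAS_NO_BLAS
  return false; // C++-only builds, or builds configured with HASBLAS=hasNoBlas / -DMGONGPU_HAS_NO_BLAS
#else
  return true; // CUDA/HIP builds where the cuBLAS/hipBLAS headers were found
#endif
}

int main()
{
  printf( "BLAS color sum available in this build: %s\n", buildHasBlas() ? "yes" : "no" );
  return 0;
}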
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/heft_gg_bb.sa/test/cudacpp_test.mk b/epochX/cudacpp/heft_gg_bb.sa/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/heft_gg_bb.sa/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt b/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt index 11380fe474..1526092aa7 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt +++ b/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +56,7 @@ set zerowidth_tchannel F import model sm-no_b_mass INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006017446517944336  +DEBUG: model prefixing takes 0.006911277770996094  INFO: Restrict model sm-no_b_mass with file models/sm/restrict_no_b_mass.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -181,7 +180,7 @@ INFO: Process u~ d > t t~ w- added to mirror process d u~ > t t~ w- INFO: Process c~ s > t t~ w- added to mirror process s c~ > t t~ w- INFO: Process d~ u > t t~ w+ added to mirror process u d~ > t t~ w+ INFO: Process s~ c > t t~ w+ added to mirror process c s~ > t t~ w+ -4 processes with 8 diagrams generated in 0.107 s +4 processes with 8 diagrams generated in 0.123 s Total: 4 processes with 8 diagrams add process p p > t t~ w j @1 INFO: Checking for minimal orders which gives processes. 
@@ -223,21 +222,21 @@ INFO: Process d~ g > t t~ w+ u~ added to mirror process g d~ > t t~ w+ u~ INFO: Process d~ u > t t~ w+ g added to mirror process u d~ > t t~ w+ g INFO: Process s~ g > t t~ w+ c~ added to mirror process g s~ > t t~ w+ c~ INFO: Process s~ c > t t~ w+ g added to mirror process c s~ > t t~ w+ g -12 processes with 144 diagrams generated in 0.640 s +12 processes with 144 diagrams generated in 0.841 s Total: 16 processes with 152 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_nobm_pp_ttW --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_nobm_pp_ttW INFO: remove old information in CODEGEN_mad_nobm_pp_ttW -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ w+ d WEIGHTED<=5 @1 INFO: Processing color information for process: g u > t t~ w+ d @1 @@ -271,9 +270,9 @@ FileWriter t t~ w+ d WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxwpd -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1577]  INFO: Creating files in directory P1_gd_ttxwmu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -282,9 +281,9 @@ FileWriter t t~ w- u WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gd_ttxwmu -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1577]  INFO: Creating files in directory P1_gux_ttxwmdx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -293,9 +292,9 @@ FileWriter t t~ w- d~ WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxwmdx -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1577]  INFO: Creating files in directory P1_gdx_ttxwpux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -304,9 +303,9 @@ FileWriter t t~ w+ u~ WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gdx_ttxwpux -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1577]  INFO: Creating files in directory P1_udx_ttxwpg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -315,9 +314,9 @@ FileWriter t t~ w+ g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group udx_ttxwpg -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1577]  INFO: Creating files in directory P1_dux_ttxwmg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -326,9 +325,9 @@ FileWriter t t~ w- g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group dux_ttxwmg -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1577]  INFO: Creating files in directory P0_udx_ttxwp DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -337,9 +336,9 @@ FileWriter t t~ w+ WEIGHTED<=4 INFO: Finding symmetric diagrams for subprocess group udx_ttxwp -DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1577]  INFO: Creating files in directory P0_dux_ttxwm DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -348,21 +347,21 @@ FileWriter t t~ w- WEIGHTED<=4 INFO: Finding symmetric diagrams for subprocess group dux_ttxwm -DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1552]  -Generated helas calls for 8 subprocesses (76 diagrams) in 0.202 s -Wrote files for 212 helas calls in 0.830 s +DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1577]  +Generated helas calls for 8 subprocesses (76 diagrams) in 0.220 s +Wrote files for 212 helas calls in 1.061 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates VVV1 set of routines with options: P0 -ALOHA: aloha creates 3 routines in 0.204 s +ALOHA: aloha creates 3 routines in 0.221 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates VVV1 set of routines with options: P0 -ALOHA: aloha creates 6 routines in 0.200 s +ALOHA: aloha creates 6 routines in 0.185 s FFV1 FFV1 FFV1 @@ -370,74 +369,32 @@ ALOHA: aloha creates 6 routines in 0.200 s FFV2 FFV2 VVV1 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./HelAmps_sm_no_b_mass.h -INFO: Created file HelAmps_sm_no_b_mass.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./HelAmps_sm_no_b_mass.h +INFO: Created file HelAmps_sm_no_b_mass.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.cc INFO: Created files Parameters_sm_no_b_mass.h and Parameters_sm_no_b_mass.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P0_dux_ttxwm; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 72 (offset 1 line). -Hunk #2 succeeded at 268 (offset 41 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P0_udx_ttxwp; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 72 (offset 1 line). -Hunk #2 succeeded at 268 (offset 41 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_dux_ttxwmg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 72 (offset 1 line). -Hunk #2 succeeded at 316 (offset 89 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_gd_ttxwmu; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 72 (offset 1 line). -Hunk #2 succeeded at 316 (offset 89 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_gdx_ttxwpux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 72 (offset 1 line). -Hunk #2 succeeded at 316 (offset 89 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_gu_ttxwpd; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 72 (offset 1 line). -Hunk #2 succeeded at 316 (offset 89 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_gux_ttxwmdx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 72 (offset 1 line). -Hunk #2 succeeded at 316 (offset 89 lines). 
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_udx_ttxwpg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 72 (offset 1 line). -Hunk #2 succeeded at 316 (offset 89 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW done. +DEBUG: result.returncode =  0 [output.py at line 273]  +Output to directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/README +/home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/README Run "open index.html" to see more information about this process. quit -real 0m4.658s -user 0m4.105s -sys 0m0.537s -Code generation completed in 5 seconds +real 0m6.431s +user 0m5.417s +sys 0m0.963s +Code generation completed in 6 seconds ************************************************************ * * * W E L C O M E to * @@ -450,7 +407,7 @@ Code generation completed in 5 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -458,10 +415,9 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -480,7 +436,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -488,10 +444,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt -Using default text editor "vi". 
Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/COPYRIGHT b/epochX/cudacpp/nobm_pp_ttW.mad/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/COPYRIGHT +++ b/epochX/cudacpp/nobm_pp_ttW.mad/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt index 68b4c46295..311ceaa803 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/proc_card_mg5.dat index 72b31976a0..961c6b1d6e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.5 2025-10-17 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/run_card.dat b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/run_card.dat index 5eca3e3f2b..48beb899d9 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/run_card.dat +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/run_card.dat @@ -127,6 +127,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/run_card_default.dat b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/run_card_default.dat index 3b445d02a0..c22a9e0249 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/run_card_default.dat @@ -127,6 +127,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/MGMEVersion.txt b/epochX/cudacpp/nobm_pp_ttW.mad/MGMEVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/MGMEVersion.txt +++ b/epochX/cudacpp/nobm_pp_ttW.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Source/.make_opts b/epochX/cudacpp/nobm_pp_ttW.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Source/.make_opts +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Source/alfas_functions.f b/epochX/cudacpp/nobm_pp_ttW.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Source/cuts.inc b/epochX/cudacpp/nobm_pp_ttW.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Source/cuts.inc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Source/make_opts b/epochX/cudacpp/nobm_pp_ttW.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Source/make_opts +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Source/makefile b/epochX/cudacpp/nobm_pp_ttW.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Source/makefile +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Source/run_card.inc b/epochX/cudacpp/nobm_pp_ttW.mad/Source/run_card.inc index 2588190439..e169c1f193 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Source/run_card.inc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. 
+// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. 
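Note on the precision-mixing rule stated in the comment above (and continued just below): when the Fortran floating-point type and the cudacpp fptype coincide, the buffers can be copied byte for byte, otherwise an element-wise converting copy is needed, which is exactly the std::is_same_v dispatch that the cpu_sequence/gpu_sequence implementations perform later in this file. A minimal standalone sketch of that dispatch, using illustrative type names (FortranFp, MeFp) rather than the plugin's real FORTRANFPTYPE/fptype typedefs:

// Standalone sketch (not the plugin's code): copy a Fortran-side buffer into a
// matrix-element-side buffer, converting only when the two precisions differ.
#include <algorithm>
#include <cstring>
#include <iostream>
#include <type_traits>
#include <vector>

template<typename FortranFp, typename MeFp>
void copyFortranToMe( const FortranFp* src, MeFp* dst, std::size_t n )
{
  if constexpr( std::is_same_v<FortranFp, MeFp> )
    std::memcpy( dst, src, n * sizeof( MeFp ) ); // same type: a raw byte copy is enough
  else
    std::copy( src, src + n, dst ); // mixed precision: element-wise conversion
}

int main()
{
  const std::vector<double> fortranGs{ 1.2, 1.3, 1.4, 1.5 }; // e.g. double Gs from MadEvent
  std::vector<float> meGs( fortranGs.size() );               // e.g. a float sigmaKin build
  copyFortranToMe( fortranGs.data(), meGs.data(), meGs.size() );
  for( float g : meGs ) std::cout << g << " ";
  std::cout << std::endl;
  return 0;
}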
In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). + */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
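The parameter lists documented above fix a simple per-event shape for every array crossing the Fortran/C++ boundary: the momenta carry np4 components for each of npar external particles per event, while all other inputs and outputs carry one entry per event. A standalone sketch of how a hypothetical caller might size those host buffers (nevt = 32 matches the --vector_size=32 used in the generation log earlier; npar and np4 are illustrative, not read from the generated process):

// Standalone sizing sketch for the Bridge sequence arguments (hypothetical caller).
#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
  const std::size_t nevt = 32; // VECSIZE_USED in the Fortran driver (vector_size=32 above)
  const std::size_t npar = 6;  // NEXTERNAL, e.g. 6 externals for a 2->4 subprocess (illustrative)
  const std::size_t np4 = 4;   // E, px, py, pz

  std::vector<double> momenta( nevt * npar * np4 ); // P_MULTI(0:3,NEXTERNAL,VECSIZE_USED)
  std::vector<double> gs( nevt );                   // running alphas coupling per event
  std::vector<double> rndhel( nevt );               // random numbers for helicity selection
  std::vector<double> rndcol( nevt );               // random numbers for colour selection
  std::vector<unsigned int> channelIds( nevt );     // diagram to enhance in multi-channel mode
  std::vector<double> mes( nevt );                  // output matrix elements
  std::vector<int> selhel( nevt );                  // output selected helicities
  std::vector<int> selcol( nevt );                  // output selected colours
  // A real caller would then hand these pointers to the Bridge, e.g.
  // bridge.cpu_sequence( momenta.data(), gs.data(), rndhel.data(), rndcol.data(),
  //                      channelIds.data(), mes.data(), selhel.data(), selcol.data() );
  std::cout << "momenta doubles: " << momenta.size() << std::endl;
  return 0;
}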
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
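Before the constructor body continues below, it is worth spelling out the two momenta layouts the Bridge mediates between: the Fortran AOS array indexed as [ievt][ipar][ip4] and the cudacpp AOSOA array indexed as [ipagM][ipar][ip4][ieppM] with ievt = ipagM*neppM + ieppM. A standalone host-only sketch of that index arithmetic, using the same formulas as the transposition helpers further down in this file; nevt, npar, np4 and neppM are illustrative values only:

// Standalone sketch of the AOS <-> AOSOA index mapping used for the momenta.
// F-style (AOS):   in[ievt][ipar][ip4]
// C-style (AOSOA): out[ipagM][ipar][ip4][ieppM], with ievt = ipagM*neppM + ieppM
#include <cassert>
#include <iostream>
#include <vector>

int main()
{
  const int nevt = 8, npar = 5, np4 = 4, neppM = 4; // illustrative values only
  assert( nevt % neppM == 0 ); // number of events must be a multiple of neppM
  std::vector<double> aos( nevt * npar * np4 ), aosoa( nevt * npar * np4 );
  for( std::size_t i = 0; i < aos.size(); ++i ) aos[i] = double( i ); // dummy momenta
  for( int ievt = 0; ievt < nevt; ++ievt )
    for( int ipar = 0; ipar < npar; ++ipar )
      for( int ip4 = 0; ip4 < np4; ++ip4 )
      {
        const int ipagM = ievt / neppM, ieppM = ievt % neppM;
        const int fpos = ievt * npar * np4 + ipar * np4 + ip4; // AOS position
        const int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; // AOSOA position
        aosoa[cpos] = aos[fpos]; // F2C direction; swapping sides gives C2F
      }
  std::cout << "first SIMD page, particle 0, component 0: ";
  for( int ieppM = 0; ieppM < neppM; ++ieppM ) std::cout << aosoa[ieppM] << " ";
  std::cout << std::endl;
  return 0;
}

The printed values show that, after the F2C copy, the same momentum component of neppM consecutive events sits contiguously in memory, which is the layout the SIMD and GPU kernels rely on.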
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MGVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
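The assertGpuBlas helper introduced in the GpuRuntime.h hunk above mirrors the existing checkGpu pattern: compare a status code against a single success value and report file and line on failure. The GpuAbstraction.h hunks pair it with single-letter precision dispatch (gpuBlasSgemm or gpuBlasDgemm selected behind gpuBlasTgemm via MGONGPU_FPTYPE2_FLOAT). A standalone CPU-only sketch of both patterns, with a toy status enum and a toy axpy standing in for the real cuBLAS/hipBLAS entry points (none of the Toy* names exist in the plugin):

// Standalone sketch (no GPU, no BLAS): status-check macro plus precision dispatch.
#include <cassert>
#include <cstdio>
#include <vector>

enum ToyBlasStatus { TOYBLAS_STATUS_SUCCESS = 0, TOYBLAS_STATUS_BAD_SIZE = 7 }; // toy stand-in

#define checkToyBlas( code ) { assertToyBlas( code, __FILE__, __LINE__ ); }
inline void assertToyBlas( ToyBlasStatus code, const char* file, int line, bool abort = true )
{
  if( code != TOYBLAS_STATUS_SUCCESS )
  {
    printf( "ERROR! assertToyBlas: '%d' in %s:%d\n", code, file, line );
    if( abort ) assert( code == TOYBLAS_STATUS_SUCCESS );
  }
}

// Toy single- and double-precision axpy (y += a*x), mimicking the S/D naming scheme
ToyBlasStatus toyBlasSaxpy( int n, float a, const float* x, float* y )
{
  if( n < 0 ) return TOYBLAS_STATUS_BAD_SIZE;
  for( int i = 0; i < n; ++i ) y[i] += a * x[i];
  return TOYBLAS_STATUS_SUCCESS;
}
ToyBlasStatus toyBlasDaxpy( int n, double a, const double* x, double* y )
{
  if( n < 0 ) return TOYBLAS_STATUS_BAD_SIZE;
  for( int i = 0; i < n; ++i ) y[i] += a * x[i];
  return TOYBLAS_STATUS_SUCCESS;
}

// Precision dispatch in the style of gpuBlasTaxpy: pick S or D from a compile-time switch
#ifdef TOY_FPTYPE2_FLOAT
#define toyBlasTaxpy toyBlasSaxpy
typedef float toyFp;
#else
#define toyBlasTaxpy toyBlasDaxpy
typedef double toyFp;
#endif

int main()
{
  std::vector<toyFp> x( 4, 1 ), y( 4, 2 );
  checkToyBlas( toyBlasTaxpy( (int)x.size(), 3, x.data(), y.data() ) ); // y becomes 5,5,5,5
  printf( "y[0] = %f\n", (double)y[0] );
  return 0;
}

Compiling the sketch with -DTOY_FPTYPE2_FLOAT switches the same call site to the single-precision routine, which is the effect the gpuBlasT* macros achieve for the colour-sum GEMMs.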
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryBuffers.h index 1e7cc050f7..71a4c3f155 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_no_b_mass_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif 
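The new MatrixElementKernelDevice data members above follow a "reserve ncomb, use nGoodHel" pattern: the stream array is statically sized to all helicity combinations, while the per-helicity device super-buffers are held through std::unique_ptr and only (re)allocated once the number of good helicities is known at runtime. A self-contained sketch of that pattern follows; the buffer class, NCOMB constant and buffer sizes are stand-ins, not the patch's actual types:

// Sketch: allocate per-helicity GPU resources only after runtime helicity filtering.
#include "GpuAbstraction.h" // assumed available: gpuStream_t, gpuStreamCreate, gpuStreamDestroy
#include "GpuRuntime.h"     // assumed available: checkGpu used inside the gpuStream* macros
#include <memory>
struct DeviceBufferSketch { explicit DeviceBufferSketch( size_t nelem ) { /* device allocation elided */ } };
constexpr int NCOMB = 16; // stand-in for CPPProcess::ncomb
class HelicityResourcesSketch
{
public:
  void onGoodHelicitiesKnown( int nGoodHel, int nevt )
  {
    // Resetting the unique_ptr releases the smaller "one-helicity" buffer used during
    // filtering and replaces it by the "many-helicity" super-buffer sized by nGoodHel
    m_pHelMEs.reset( new DeviceBufferSketch( (size_t)nGoodHel * nevt ) );
    for( int ighel = 0; ighel < nGoodHel; ighel++ )
      gpuStreamCreate( &m_helStreams[ighel] ); // only the first nGoodHel streams are created
  }
  ~HelicityResourcesSketch()
  {
    for( int ihel = 0; ihel < NCOMB; ihel++ )
      if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // never-created entries stay nullptr
  }
private:
  std::unique_ptr<DeviceBufferSketch> m_pHelMEs; // runtime-sized device super-buffer
  gpuStream_t m_helStreams[NCOMB] = {};          // value-initialised, as in the patch's m_helStreams()
};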
@@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + 
typedef DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc index 97050f0aa2..f17f7676e3 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm_no_b_mass.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -97,9 +99,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -107,10 +110,7 @@ namespace mg5amcCpu using Parameters_sm_no_b_mass_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_no_b_mass_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -169,43 +169,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ 
INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = 
DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -217,7 +273,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -226,14 +281,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
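The DeviceAccessJamp2 accessor introduced earlier in this file addresses the jamp2 buffer as buffer[icol * nevt + ievt], i.e. a structure-of-arrays layout in which the event index varies fastest. A short illustrative kernel (not part of the patch) showing why this indexing gives coalesced global-memory accesses:

// Sketch: for a fixed color index icol, consecutive threads (consecutive ievt) read
// consecutive addresses, so each warp issues coalesced loads from the jamp2-style buffer.
__global__ void sumOverColorsSketch( const double* buffer, double* out, const int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;                // same convention as DeviceAccessJamp2
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per GPU thread
  double sum = 0;
  for( int icol = 0; icol < ncolor; icol++ )
    sum += buffer[icol * nevt + ievt]; // same index arithmetic as kernelAccessIcolConst
  out[ievt] = sum;
}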
@@ -259,14 +317,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -290,7 +344,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -304,7 +357,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -315,6 +367,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -360,154 +416,43 @@ namespace mg5amcCpu jamp_sv[1] -= 1. / 2. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_dux_ttxwm()?) 
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 9, 3 }, - { 3, 9 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
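The atomicAdd in the jamp2 accumulation above is needed because, after this patch, one calculate_jamps kernel is launched per good helicity on its own stream, and those kernels may run concurrently while all adding into the same per-event jamp2 accumulator. A small illustrative kernel (not part of the patch) making the race explicit:

// Sketch: kernels launched on different streams (one per good helicity) all add their
// |jamp|^2 contribution into the same accumulator element for a given (icol, ievt);
// since those kernels can overlap in time, the read-modify-write must be atomic
// (a plain "+=" would be a data race across streams).
__global__ void accumulateJamp2Sketch( double* colAllJamp2s, const double* myAbs2, const int icol )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  atomicAdd( &colAllJamp2s[icol * nevt + ievt], myAbs2[ievt] );
}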
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -579,7 +524,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -613,6 +562,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MW ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_MW, (fptype)m_pars->mdl_WT }; @@ -655,6 +608,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MW ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -775,8 +732,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -784,25 +741,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity 
selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -947,13 +1082,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -965,18 +1094,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1001,93 +1135,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1129,7 +1200,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1152,7 +1223,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1161,21 +1232,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1189,8 +1262,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1206,11 +1281,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1312,14 +1388,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h index 9d6c262053..a1daef0aaa 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm_no_b_mass.h" #include @@ -76,6 +77,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 48; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 2; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -123,7 +125,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -131,9 +133,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -153,34 +157,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, 
running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig.f index 7f7324dc0b..a5edcacd08 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig1.f index 08dd1f728a..b967ccba14 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -138,7 +138,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) S1=PDG2PDF(LPP(IB(1)),3, IB(1),XBK(IB(1)), QSCALE) @@ -146,7 +146,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) UX2=PDG2PDF(LPP(IB(2)),-2, IB(2),XBK(IB(2)), QSCALE) @@ -225,7 +225,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -297,6 +297,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -380,16 +384,16 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -459,51 +463,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/color_sum.cc new file mode 100644 index 0000000000..04c22fd369 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/color_sum.cc @@ -0,0 +1,427 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 9, 3 }, + { 3, 9 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here
+    for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ )
+      for( int icol = 0; icol < ncolor; icol++ )
+        allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] =
+          allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt];
+  }
+#endif
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+#ifndef MGONGPU_HAS_NO_BLAS
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+  __global__ void
+  convertF2D_MEs( fptype* allMEs,             // output: allMEs[nevt] for one specific helicity
+                  const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity
+  {
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+    allMEs[ievt] = allMEsFpt2[ievt];
+  }
+#endif
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifndef MGONGPU_HAS_NO_BLAS
+  void
+  color_sum_blas( fptype* ghelAllMEs,           // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity
+                  const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities
+                  fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nhel good helicities
+                  gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+                  gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null)
+#else
+                  gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null)
+#endif
+                  const int nhel,               // input: number of good helicities (nhel == nGoodHel)
+                  const int gpublocks,          // input: cuda gpublocks
+                  const int gputhreads )        // input: cuda gputhreads
+  {
+    const int nevt = gpublocks * gputhreads;
+
+    // Get the address associated with the normalized color matrix in device memory
+    static fptype2* devNormColMat = nullptr;
+    if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 );
+
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffer for the nhel helicities
+    fptype2* ghelAllZtempBoth = ghelAllBlasTmp;                                         // start of first fptype2[ncolor*2*nhel*nevt] buffer
+    fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt;   // start of second fptype2[ncolor*2*nhel*nevt] buffer
+    fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer
+    // Convert jamps from double to float
+    for( int ighel = 0; ighel < nhel; ighel++ )
+    {
+      const fptype* hAllJamps = ghelAllJamps + ighel * nevt;    // jamps for a single helicity ihel
+      fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel
+      gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel );
+    }
+    // Real and imaginary components
+    const fptype2* ghelAllJampsReal = ghelAllJampsFpt2;
+    const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt;
+#else
+    // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer
+    static_assert( std::is_same<fptype, fptype2>::value );
+    fptype2* ghelAllZtempBoth = ghelAllBlasTmp; //
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/configs.inc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/configs.inc index a4ca4e23a5..3d0bd5df67 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/configs.inc +++ 
b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/configs.inc @@ -24,3 +24,5 @@ C Diagram 2 DATA (SPROP(I,-3,2),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/2/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/cudacpp_overlay.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/driver.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/driver.f index 531dfa0771..51ded2dd76 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/driver.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/fbridge.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/makefile_original.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/matrix1.f index 5c47e1c729..f350dd008d 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -72,10 +72,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -268,17 +265,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -355,7 +341,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -399,7 +385,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(1) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -444,23 +431,31 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 2) /9.000000000000000D+00 - $ ,3.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 2) /9,6/ C 1 T(2,1) T(3,4) - DATA (CF(I, 2),I= 1, 2) /3.000000000000000D+00 - $ ,9.000000000000000D+00/ + DATA (CF(I),I= 3, 3) /9/ C 1 T(2,4) T(3,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) - IF(MDL_WW.NE.0D0) FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW - $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + + IF(MDL_WW.NE.0D0) THEN + FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW + $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + ELSE + FK_MDL_WW = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
@@ -497,10 +492,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -509,6 +506,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc index 57246ba1e7..80467846e0 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm_no_b_mass.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -97,9 +99,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -107,10 +110,7 @@ namespace mg5amcCpu using Parameters_sm_no_b_mass_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_no_b_mass_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -169,43 +169,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ 
INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = 
DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -217,7 +273,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -226,14 +281,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
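The new calculate_jamps interface above writes each helicity's partial amplitudes into a per-helicity slot of a ghelAllJamps super-buffer. Below is a minimal standalone sketch of the flattened [real/imag][icol][ihel][ievt] striding described by the comments and accessors elsewhere in this diff (DeviceAccessJamp is assumed to encode the same layout); jampSuperIndex is a hypothetical helper for illustration only, not part of the plugin.

#include <cstddef>

// Hypothetical helper (illustration only): flattened offset of one (re/im, icol, ihel, ievt)
// element in the ghelAllJamps super-buffer, matching the striding used by convertD2F_Jamps
// and the cuBLAS notes further down: reim*ncolor*nhel*nevt + icol*nhel*nevt + ihel*nevt + ievt.
inline std::size_t jampSuperIndex( int reim, int icol, int ihel, int ievt,
                                   int ncolor, int nhel, int nevt )
{
  return ( ( (std::size_t)reim * ncolor + icol ) * nhel + ihel ) * nevt + ievt;
}
// With this layout, passing "ghelAllJamps + ighel * nevt" to calculate_jamps together with
// ihel0 = 0 addresses the same elements as indexing the full buffer with ihel = ighel, which
// is why the per-helicity pointer offset used in sigmaKin is consistent with DeviceAccessJamp.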
@@ -259,14 +317,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -290,7 +344,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -304,7 +357,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -315,6 +367,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -360,154 +416,43 @@ namespace mg5amcCpu jamp_sv[1] -= 1. / 2. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_udx_ttxwp()?) 
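// Illustration (not plugin code, and deliberately simplified): the per-color cxabs2 running
// sums stored just above are what the event-by-event color choice eventually consumes. A
// standalone sketch of that choice - build the cumulative sum of the color weights and pick
// the first color whose cumulative fraction exceeds a random number in [0,1) - is given here;
// the real select_col kernel further down additionally applies the icolamp/iconfig filtering
// and the channelId sanity checks, which are omitted in this toy example.
#include <cstdio>
int main()
{
  constexpr int ncolor = 2;
  const double jamp2[ncolor] = { 0.75, 0.25 }; // made-up |jamp|^2 sums over good helicities
  const double rndcol = 0.9;                   // made-up random number in [0,1)
  double targetamp[ncolor];
  targetamp[0] = jamp2[0];
  for( int icol = 1; icol < ncolor; icol++ ) targetamp[icol] = targetamp[icol - 1] + jamp2[icol];
  int selcol = 0; // 0 means "no color selected"
  for( int icol = 0; icol < ncolor; icol++ )
    if( rndcol < targetamp[icol] / targetamp[ncolor - 1] )
    {
      selcol = icol + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
      break;
    }
  std::printf( "selected color = %d\n", selcol );
  return 0;
}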
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 9, 3 }, - { 3, 9 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
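// Worked standalone example (assumed values from this process: ncolor=2, cf={{9,3},{3,9}},
// denom={1,1}; the jamp values are made up): with a real symmetric color matrix the quadratic
// form conj(J)^T (cf/denom) J has no imaginary part, so |M|^2 can be computed either from the
// full matrix or from the triangular rewrite of #475 (diagonal terms once, off-diagonal terms
// folded with a factor 2); the two evaluations below give the same result.
#include <complex>
#include <cstdio>
int main()
{
  constexpr int ncolor = 2;
  const std::complex<double> jamp[ncolor] = { { 0.3, -0.1 }, { -0.2, 0.4 } };
  const double cf[ncolor][ncolor] = { { 9, 3 }, { 3, 9 } };
  const double denom[ncolor] = { 1, 1 };
  double me2full = 0; // full quadratic form: sum_ij ( ReJi*ReJj + ImJi*ImJj ) * cf[i][j] / denom[i]
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      me2full += ( jamp[i].real() * jamp[j].real() + jamp[i].imag() * jamp[j].imag() ) * cf[i][j] / denom[i];
  double me2tri = 0; // triangular rewrite: diagonal terms once, upper off-diagonal terms doubled
  for( int i = 0; i < ncolor; i++ )
  {
    me2tri += ( jamp[i].real() * jamp[i].real() + jamp[i].imag() * jamp[i].imag() ) * cf[i][i] / denom[i];
    for( int j = i + 1; j < ncolor; j++ )
      me2tri += 2 * ( jamp[i].real() * jamp[j].real() + jamp[i].imag() * jamp[j].imag() ) * cf[i][j] / denom[i];
  }
  std::printf( "full=%.12f triangular=%.12f\n", me2full, me2tri );
  return 0;
}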
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -579,7 +524,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -613,6 +562,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MW ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_MW, (fptype)m_pars->mdl_WT }; @@ -655,6 +608,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MW ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -775,8 +732,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -784,25 +741,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity 
selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -947,13 +1082,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -965,18 +1094,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1001,93 +1135,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1129,7 +1200,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1152,7 +1223,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1161,21 +1232,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1189,8 +1262,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1206,11 +1281,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1312,14 +1388,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h index cd8edd3e39..a193c09aed 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm_no_b_mass.h" #include @@ -76,6 +77,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 48; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 2; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -123,7 +125,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -131,9 +133,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -153,34 +157,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, 
running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig.f index 2e439af0a3..817d8f646f 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig1.f index 0808ce67ce..01d47ba27c 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -138,7 +138,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) C1=PDG2PDF(LPP(IB(1)),4, IB(1),XBK(IB(1)), QSCALE) @@ -146,7 +146,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) DX2=PDG2PDF(LPP(IB(2)),-1, IB(2),XBK(IB(2)), QSCALE) @@ -225,7 +225,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -297,6 +297,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -380,16 +384,16 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -459,51 +463,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.cc new file mode 100644 index 0000000000..04c22fd369 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.cc @@ -0,0 +1,427 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 9, 3 }, + { 3, 9 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
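    // What follows implements the color sum in two flavours of the same loop:
    // - in mixed precision (MGONGPU_FPTYPE_DOUBLE with MGONGPU_FPTYPE2_FLOAT), the two neppV
    //   event pages received in allJamp_sv are first merged into float vectors twice the SIMD
    //   width (fpvmerge), the color sum is evaluated once on the wider vectors, and the two
    //   halves of each result are then split back (fpvsplit0/fpvsplit1) into deltaMEs for the
    //   first page and deltaMEs_next for the second page;
    // - in plain float or double builds, a single neppV page is processed and only deltaMEs is used.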
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffer for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same<fptype, fptype2>::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; //
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/configs.inc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/configs.inc index fd7f72bff4..2a57ec47a3 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/configs.inc +++ 
b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/configs.inc @@ -24,3 +24,5 @@ C Diagram 2 DATA (SPROP(I,-3,2),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/2/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/cudacpp_overlay.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/driver.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/driver.f index 531dfa0771..51ded2dd76 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/driver.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/fbridge.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/makefile_original.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/matrix1.f index bbf708250a..357edfe899 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -72,10 +72,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -268,17 +265,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -355,7 +341,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -399,7 +385,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(1) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -444,23 +431,31 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 2) /9.000000000000000D+00 - $ ,3.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 2) /9,6/ C 1 T(2,1) T(3,4) - DATA (CF(I, 2),I= 1, 2) /3.000000000000000D+00 - $ ,9.000000000000000D+00/ + DATA (CF(I),I= 3, 3) /9/ C 1 T(2,4) T(3,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) - IF(MDL_WW.NE.0D0) FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW - $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + + IF(MDL_WW.NE.0D0) THEN + FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW + $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + ELSE + FK_MDL_WW = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
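Illustrative aside (not part of the generated patch): the matrix1.f hunk above replaces the full NCOLOR x NCOLOR real color matrix CF by a flat INTEGER array holding only its upper triangle, with off-diagonal entries pre-doubled (9,6,9 here instead of the 2x2 matrix 9,3 / 3,9) and a common DENOM applied once at the end; the CF_INDEX loop in the next hunk then walks this packed array row by row. A minimal standalone C++ sketch of the same packed accumulation, with hypothetical names (packedColorSum, cf, jamp):

    #include <complex>
    // |M|^2 = Re( sum_i conj(jamp[i]) * sum_{j>=i} cf[idx] * jamp[j] ) / denom,
    // where cf holds the upper triangle row by row and off-diagonal entries are already doubled
    double packedColorSum( const int* cf, int denom, const std::complex<double>* jamp, int ncolor )
    {
      double me2 = 0;
      int idx = 0; // running index into the packed triangular color matrix (cf. CF_INDEX)
      for( int i = 0; i < ncolor; i++ )
      {
        std::complex<double> ztemp = 0;
        for( int j = i; j < ncolor; j++ ) ztemp += double( cf[idx++] ) * jamp[j];
        me2 += std::real( ztemp * std::conj( jamp[i] ) ); // real part of ZTEMP*DCONJG(JAMP(I))
      }
      return me2 / denom;
    }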
@@ -497,10 +492,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -509,6 +506,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc index 3261780672..e3a7b6109e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm_no_b_mass.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -97,9 +99,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -107,10 +110,7 @@ namespace mg5amcCpu using Parameters_sm_no_b_mass_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_no_b_mass_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -169,43 +169,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ 
INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = 
DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -217,7 +273,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -226,14 +281,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
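Illustrative aside (not part of the generated patch): the new DeviceAccessJamp2 accessor above and the ghelAllJamps super-buffer documented earlier in this patch both use an event-major structure-of-arrays layout, so that the event index (one GPU thread per event) is the fastest-running dimension. A small standalone C++ sketch of the two flattenings, with hypothetical helper names (jamp2Index, jampSuperIndex):

    // jamp2 buffer        allJamp2s[icol][ievt]            ->  icol * nevt + ievt          (cf. DeviceAccessJamp2)
    // jamp super-buffer   allJamps[ix2][icol][ihel][ievt]  ->  ((ix2 * ncolor + icol) * nhel + ihel) * nevt + ievt
    inline int jamp2Index( int icol, int ievt, int nevt )
    {
      return icol * nevt + ievt;
    }
    inline int jampSuperIndex( int ix2, int icol, int ihel, int ievt, int ncolor, int nhel, int nevt )
    {
      // identical to the striding ix2*ncolor*nhel*nevt + icol*nhel*nevt + ihel*nevt + ievt used for cuBLAS
      return ( ( ix2 * ncolor + icol ) * nhel + ihel ) * nevt + ievt;
    }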
@@ -259,14 +317,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -290,7 +344,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -304,7 +357,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -315,6 +367,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -506,156 +562,43 @@ namespace mg5amcCpu jamp_sv[3] += 1. / 6. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_dux_ttxwmg()?) 
- - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
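Illustrative aside (not part of the generated patch): the removed comment above (whose logic survives in the new color_sum code earlier in this patch) uses the fact that for a real symmetric matrix M and a complex vector J = A + iB the quadratic form conj(J)^T M J is real and equals A^T M A + B^T M B, so the color sum can be evaluated on the real and imaginary parts separately. A minimal numeric check of that identity (hypothetical standalone test):

    #include <cassert>
    #include <cmath>
    #include <complex>
    int main()
    {
      const double M[2][2] = { { 12, 4 }, { 4, 12 } };                    // a small real symmetric matrix
      const std::complex<double> J[2] = { { 1.5, -0.5 }, { 0.25, 2.0 } }; // an arbitrary complex test vector
      std::complex<double> quad = 0; // conj(J)^T M J
      double realForm = 0;           // A^T M A + B^T M B
      for( int i = 0; i < 2; i++ )
        for( int j = 0; j < 2; j++ )
        {
          quad += std::conj( J[i] ) * M[i][j] * J[j];
          realForm += J[i].real() * M[i][j] * J[j].real() + J[i].imag() * M[i][j] * J[j].imag();
        }
      assert( std::abs( quad.imag() ) < 1e-12 );            // the imaginary part cancels for symmetric M
      assert( std::abs( quad.real() - realForm ) < 1e-12 ); // Re(conj(J)^T M J) == A^T M A + B^T M B
      return 0;
    }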
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -775,7 +718,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -810,6 +757,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MW ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_MW, (fptype)m_pars->mdl_WT }; @@ -853,6 +804,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MW ); m_masses.push_back( Parameters_sm_no_b_mass::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -973,8 +928,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -982,25 +937,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity 
selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1145,13 +1278,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1163,18 +1290,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1199,93 +1331,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
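Illustrative aside (not part of the generated patch): the SCALAR-channelId comment above, the code removed just below, and the C++ path kept further down in this hunk all rely on the same contract (#898): all events within one SIMD event page carry the same channelId, so a single scalar channelId can be used for the whole page. A minimal sketch of that extraction and check, with a hypothetical helper name (uniformChannelId):

    #include <cassert>
    // Return the single channelId shared by a SIMD page of neppV events (contract #898)
    unsigned int uniformChannelId( const unsigned int* pageChannelIds, int neppV )
    {
      const unsigned int channelId = pageChannelIds[0];
      for( int i = 1; i < neppV; i++ ) assert( pageChannelIds[i] == channelId ); // all equal within the page
      assert( channelId > 0 ); // a scalar channelId must be > 0 when multichannel is enabled
      return channelId;
    }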
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1327,7 +1396,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1350,7 +1419,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1359,21 +1428,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1387,8 +1458,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1404,11 +1477,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1510,14 +1584,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h index ecb184f729..582051038f 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm_no_b_mass.h" #include @@ -76,6 +77,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 96; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 12; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -123,7 +125,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -131,9 +133,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -153,34 +157,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, 
running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig.f index 26d6979a1d..7b6075d0bd 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig1.f index 330b566ed8..80270f0371 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -138,7 +138,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) S1=PDG2PDF(LPP(IB(1)),3, IB(1),XBK(IB(1)), QSCALE) @@ -146,7 +146,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) UX2=PDG2PDF(LPP(IB(2)),-2, IB(2),XBK(IB(2)), QSCALE) @@ -225,7 +225,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -297,6 +297,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -380,16 +384,16 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -459,51 +463,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/color_sum.cc new file mode 100644 index 0000000000..42eca2f7c9 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/color_sum.cc @@ -0,0 +1,429 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
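// [Editor's note - illustrative sketch only, not part of the generated file] color_sum.cc factors out the color
// sum that used to be inlined in calculate_wavefunctions: for each event and each good helicity it adds
//   deltaME = sum_{i,j} ( colorMatrix[i][j] / colorDenom[i] ) * ( Re(jamp[i])*Re(jamp[j]) + Im(jamp[i])*Im(jamp[j]) )
// to the running |M|^2, either per event in a plain kernel or, when a BLAS handle is available, as a GEMM with
// the color matrix followed by batched per-event dot products (see color_sum_blas below). A scalar reference
// version, using hypothetical helper names for orientation only, could read:
//   fptype colorSumOneHelicity( const cxtype* jamp ) // jamp[ncolor] for one event and one helicity
//   {
//     fptype deltaME = 0;
//     for( int icol = 0; icol < ncolor; icol++ )
//       for( int jcol = 0; jcol < ncolor; jcol++ )
//         deltaME += colorMatrix[icol][jcol] / colorDenom[icol] * ( jamp[icol].real() * jamp[jcol].real() + jamp[icol].imag() * jamp[jcol].imag() );
//     return deltaME;
//   }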
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
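      // [Editor's note, comments only] Spelling out the identity quoted above: with jamp = A + i*B per color and
      // a real color matrix M (here M_ij = colorMatrix[i][j]/colorDenom[i]),
      //   (A - iB)^T M (A + iB) = A^T M A + i A^T M B - i B^T M A + B^T M B
      // and for symmetric M the two imaginary cross terms cancel (B^T M A = A^T M B), leaving A^T M A + B^T M B.
      // The triangular cf2 table stores M_ii on the diagonal and 2*M_ij (j > i) off the diagonal, so the loops
      // below visit each unordered color pair exactly once.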
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for all good helicities in a single call + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/configs.inc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/configs.inc index 137b6b3695..b386c37679 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/configs.inc +++
b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/configs.inc @@ -180,3 +180,5 @@ C Diagram 12 DATA (SPROP(I,-4,12),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/12/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/cudacpp_overlay.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/driver.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/driver.f index d8518f17f7..439883b7b1 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/driver.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/fbridge.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/makefile_original.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/matrix1.f index 4b8ccfcacb..9eade535f2 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -72,10 +72,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -316,17 +313,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -403,7 +389,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -447,7 +433,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(7) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -492,33 +479,35 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(2,1) T(6,3,4) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(2,4) T(6,3,1) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(3,1) T(6,2,4) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(3,4) T(6,2,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) - IF(MDL_WW.NE.0D0) FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW - $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + + IF(MDL_WW.NE.0D0) THEN + FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW + $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + ELSE + FK_MDL_WW = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
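[Editor's note] The matrix1.f hunks above and below replace the dense REAL*8 CF(NCOLOR,NCOLOR) color matrix by a packed integer upper triangle (off-diagonal entries pre-doubled, one common DENOM divided out at the end), which the next hunk walks with a running CF_INDEX. A minimal C++ sketch of the equivalent indexing, assuming NCOLOR=4 and the DATA values shown above (illustration only, not part of the patch):

  #include <cassert>
  int main()
  {
    const int ncolor = 4;
    const int cfPacked[10] = { 12, 8, 8, 0, 12, 0, 8, 12, 8, 12 }; // rows I=1..4, entries J=I..4
    int cfIndex = 0; // mimics the Fortran CF_INDEX counter
    for( int i = 0; i < ncolor; i++ )
      for( int j = i; j < ncolor; j++ )
      {
        const int packed = i * ncolor - i * ( i - 1 ) / 2 + ( j - i ); // closed form for the same walk
        assert( packed == cfIndex ); // CF(CF_INDEX) multiplies JAMP(J,M) in the Fortran loop
        (void)cfPacked[cfIndex++];
      }
    return 0;
  }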
@@ -601,10 +590,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -613,6 +604,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc index c933a8f276..dc3d8b4896 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm_no_b_mass.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -97,9 +99,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -107,10 +110,7 @@ namespace mg5amcCpu using Parameters_sm_no_b_mass_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_no_b_mass_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -169,43 +169,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ 
INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = 
DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -217,7 +273,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -226,14 +281,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
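[Editor's note] The new CUDA path above repeatedly refers to the ghelAllJamps "super-buffer", whose layout [2][ncolor][nGoodHel][nevt] (real/imaginary plane, then color, then good-helicity index, then event) is spelled out in the striding comments of color_sum.cc. A minimal index helper consistent with that documented layout (illustration only; the actual accessor is the DeviceAccessJamp class, which is not shown in this excerpt):

  // Hypothetical helper: flat index into ghelAllJamps[2][ncolor][nGoodHel][nevt]
  inline int jampSuperIndex( int ix2, int icol, int ighel, int ievt, int ncolor, int nGoodHel, int nevt )
  {
    // ix2 = 0 selects the real plane, ix2 = 1 the imaginary plane (mgOnGpu::nx2 == 2)
    return ix2 * ncolor * nGoodHel * nevt + icol * nGoodHel * nevt + ighel * nevt + ievt;
  }
  // This is why sigmaKin can pass the per-helicity slice ghelAllJamps + ighel * nevt to calculate_jamps
  // and to color_sum_kernel, which then use ihel = 0 on that slice with the same nGoodHel stride.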
@@ -259,14 +317,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -290,7 +344,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -304,7 +357,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -315,6 +367,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -506,156 +562,43 @@ namespace mg5amcCpu jamp_sv[3] -= 1. / 2. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gd_ttxwmu()?) 
- - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
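The atomicAdd in the new GPU branch above is needed because, with one CUDA/HIP stream per good helicity (see sigmaKin below), several calculate_jamps kernels may update the same jamp2 slot concurrently. A minimal sketch of that accumulation pattern under the same assumption (hypothetical simplified kernel with a scalar layout; atomicAdd on double requires compute capability 6.0 or later):

// Hypothetical sketch: each helicity stream atomically adds |jamp[icol]|^2
// for its helicity into a per-color buffer shared by all helicity streams.
__global__ void accumulateJamp2( double* jamp2,        // [ncolor*nevt], shared across helicity streams
                                 const double* jampRe, // [ncolor*nevt], this helicity only
                                 const double* jampIm, // [ncolor*nevt], this helicity only
                                 int ncolor, int nevt )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  if( ievt >= nevt ) return;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    const double re = jampRe[icol * nevt + ievt];
    const double im = jampIm[icol * nevt + ievt];
    // atomicAdd: kernels launched on different streams may race on this slot
    atomicAdd( &jamp2[icol * nevt + ievt], re * re + im * im );
  }
}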
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -775,7 +718,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -810,6 +757,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MW ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_MW, (fptype)m_pars->mdl_WT }; @@ -853,6 +804,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MW ); m_masses.push_back( Parameters_sm_no_b_mass::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -973,8 +928,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -982,25 +937,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity 
selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1145,13 +1278,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1163,18 +1290,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1199,93 +1331,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1327,7 +1396,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1350,7 +1419,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1359,21 +1428,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1387,8 +1458,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1404,11 +1477,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1510,14 +1584,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h index a5c44d3213..1510b6bae5 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm_no_b_mass.h" #include @@ -76,6 +77,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 96; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 12; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -123,7 +125,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -131,9 +133,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -153,34 +157,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, 
running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig.f index 3779397ce4..e19077b3dc 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig1.f index 1dae307565..4ce490707d 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -138,14 +138,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF D2=PDG2PDF(LPP(IB(2)),1, IB(2),XBK(IB(2)), QSCALE) S2=PDG2PDF(LPP(IB(2)),3, IB(2),XBK(IB(2)), QSCALE) @@ -224,7 +224,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -296,6 +296,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -379,14 +383,14 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) D2(IVEC)=PDG2PDF(LPP(IB(2)),1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) S2(IVEC)=PDG2PDF(LPP(IB(2)),3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -456,51 +460,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/color_sum.cc new file mode 100644 index 0000000000..42eca2f7c9 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/color_sum.cc @@ -0,0 +1,429 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
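The new color_sum.cc that starts here collects the color algebra previously inlined in calculate_wavefunctions: a C++ implementation (color_sum_cpu), a plain CUDA/HIP kernel (color_sum_kernel) and a cuBLAS/hipBLAS path (color_sum_blas). Following the comments in the file, all of them add to |M|^2, per event and per helicity, the same quadratic form of the jamps J with the real color matrix C and denominators d; the BLAS path factors this into one GEMM Z = C~ J over all helicities and events plus batched per-event dot products (the "Step 1"/"Step 2" calls below). In compact notation:

\Delta|M|^2
  = \sum_{i,j=1}^{n_{\mathrm{color}}} J_i^{*}\,\frac{C_{ij}}{d_i}\,J_j
  = \operatorname{Re}J^{\top}\,\tilde{C}\,\operatorname{Re}J
  + \operatorname{Im}J^{\top}\,\tilde{C}\,\operatorname{Im}J ,
\qquad \tilde{C}_{ij} = \frac{C_{ij}}{d_i} .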
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
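The TriangularNormalizedColorMatrix above folds the factor 2 and the division by colorDenom into the upper triangle at compile time, relying on the symmetry of the color matrix; the GPU kernel further down uses the same identity with the lower triangle ("ihel3p1"). A standalone sanity check of that identity on a hypothetical 2x2 symmetric matrix (values illustrative only, not this process's color matrix):

// Check that the triangular form  sum_i M_ii x_i^2 + sum_{i<j} 2 M_ij x_i x_j
// equals the full symmetric quadratic form  sum_{i,j} M_ij x_i x_j.
#include <cassert>
#include <cmath>

int main()
{
  const double M[2][2] = { { 12, 4 }, { 4, 12 } }; // symmetric, illustrative values
  const double x[2] = { 0.3, -1.7 };
  double full = 0, tri = 0;
  for( int i = 0; i < 2; i++ )
    for( int j = 0; j < 2; j++ )
      full += M[i][j] * x[i] * x[j];
  for( int i = 0; i < 2; i++ )
  {
    tri += M[i][i] * x[i] * x[i];
    for( int j = i + 1; j < 2; j++ )
      tri += 2 * M[i][j] * x[i] * x[j];
  }
  assert( std::abs( full - tri ) < 1e-12 ); // the two forms agree
  return 0;
}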
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for all good helicities + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/configs.inc index 4cdcf03d63..03c4795328 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/configs.inc +++ 
b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/configs.inc @@ -174,3 +174,5 @@ C Diagram 12 DATA (SPROP(I,-4,12),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/12/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/cudacpp_overlay.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/driver.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/driver.f index d8518f17f7..439883b7b1 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/driver.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/fbridge.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/makefile_original.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/matrix1.f index a3a57cd8b8..e520ea078c 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -72,10 +72,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -316,17 +313,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -403,7 +389,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -447,7 +433,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(7) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -492,33 +479,35 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(1,3,2) T(6,4) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(1,3,4) T(6,2) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(1,6,2) T(3,4) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(1,6,4) T(3,2) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) - IF(MDL_WW.NE.0D0) FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW - $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + + IF(MDL_WW.NE.0D0) THEN + FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW + $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + ELSE + FK_MDL_WW = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
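
The matrix1.f change above replaces the full REAL*8 CF(NCOLOR,NCOLOR) matrix by a packed integer array that keeps only the upper triangle (row I stores the entries J = I..NCOLOR), with off-diagonal values already doubled and a single integer DENOM divided out at the end; the loop in the next hunk walks this packed array with CF_INDEX. A minimal C++ sketch of the equivalent computation, with illustrative names only (this is not the generated code):

#include <complex>

// Packed upper-triangular color sum: cf holds row i = 0..ncolor-1, each with entries j = i..ncolor-1,
// off-diagonal values pre-doubled; the common integer denominator is divided out once at the end.
double matrixFromPackedCF( const int* cf, int denom, int ncolor, const std::complex<double>* jamp )
{
  double result = 0;
  int cfIndex = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = i; j < ncolor; j++ ) ztemp += double( cf[cfIndex++] ) * jamp[j]; // packed upper triangle
    result += ( ztemp * std::conj( jamp[i] ) ).real(); // accumulating into the REAL*8 MATRIX1 keeps the real part
  }
  return result / denom;
}

For NCOLOR=4 the packed array has 4*5/2 = 10 entries, exactly the values in the four DATA statements above ({12,8,8,0}, {12,0,8}, {12,8}, {12}), i.e. the old square matrix with its off-diagonal 4s doubled to 8.
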
@@ -601,10 +590,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -613,6 +604,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc index 6f1f37d1eb..e2a0d50b47 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm_no_b_mass.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -97,9 +99,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -107,10 +110,7 @@ namespace mg5amcCpu using Parameters_sm_no_b_mass_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_no_b_mass_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -169,43 +169,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ 
INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = 
DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -217,7 +273,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -226,14 +281,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
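
The allJamps argument above is one helicity's slice of a jamp super-buffer laid out as [real/imag][icolor][ighel][ievt] with the event index running fastest, the same striding assumed by the D2F conversion kernel and the cuBLAS color sum earlier in this patch. A minimal sketch of the flattened index; the helper and its names are illustrative assumptions, not the plugin's DeviceAccessJamp API:

#include <cassert>

// Flat index into a jamp super-buffer laid out as [ix2][icol][ighel][ievt] (ievt fastest).
inline int jampFlatIndex( int ix2, int icol, int ighel, int ievt,
                          int ncolor, int nGoodHel, int nevt )
{
  assert( ix2 >= 0 && ix2 < 2 );            // 0 = real part, 1 = imaginary part
  assert( icol >= 0 && icol < ncolor );     // color flow index
  assert( ighel >= 0 && ighel < nGoodHel ); // good-helicity index
  assert( ievt >= 0 && ievt < nevt );       // event index (fastest)
  return ( ( ix2 * ncolor + icol ) * nGoodHel + ighel ) * nevt + ievt;
}

Offsetting the super-buffer by ighel * nevt, as the stream-launch code elsewhere in this patch does, selects one helicity's slice while keeping the same strides, which is why the per-helicity kernels can index it with ihel = 0.
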
@@ -259,14 +317,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -290,7 +344,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -304,7 +357,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -315,6 +367,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -506,156 +562,43 @@ namespace mg5amcCpu jamp_sv[1] -= 1. / 6. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gdx_ttxwpux()?) 
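
Because each good helicity is now processed by its own kernel launch in its own CUDA/HIP stream, several of these kernels can be resident at the same time and all of them add their |jamp|^2 into the same colAllJamp2s[icol][ievt] slot, hence the atomicAdd above. A standalone sketch of the pattern, with illustrative buffer and kernel names (not the plugin's own):

#include <cuda_runtime.h>

// One launch per good helicity, each on its own stream; all launches accumulate into the same
// jamp2 buffer of size ncolor*nevt, so the read-modify-write must be atomic.
// (atomicAdd on double requires compute capability >= 6.0.)
__global__ void accumulateJamp2( double* jamp2,          // [ncolor][nevt], shared across helicity streams
                                 const double* absJamp2, // [ncolor][nevt], |jamp|^2 for this helicity
                                 const int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  for( int icol = 0; icol < ncolor; icol++ )
    atomicAdd( &jamp2[icol * nevt + ievt], absJamp2[icol * nevt + ievt] );
}

Without the atomic, two helicity kernels scheduled concurrently could both read the old value of the same slot and one contribution would be lost.
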
- - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
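
The comments being removed here describe the two properties that the new color-sum code still relies on: the color matrix is real, so the quadratic form J^dagger M J collapses to Re(J) M Re(J) + Im(J) M Im(J), and it is symmetric, so only one triangle needs to be visited if the off-diagonal entries are counted twice. A small self-contained sketch of that computation, using plain std::complex rather than the plugin's vector types and assuming the matrix is already normalized:

#include <complex>
#include <vector>

// Color sum over a real symmetric (already normalized) matrix m, visiting only the lower triangle:
// |M|^2 = sum_i m[i][i]*|J_i|^2 + 2*sum_{j<i} m[i][j]*( Re J_i Re J_j + Im J_i Im J_j ).
double colorSumTriangular( const std::vector<std::vector<double>>& m,
                           const std::vector<std::complex<double>>& jamp )
{
  double me2 = 0;
  const int ncolor = (int)jamp.size();
  for( int icol = 0; icol < ncolor; icol++ )
  {
    double ztempR = m[icol][icol] * jamp[icol].real(); // diagonal term, counted once
    double ztempI = m[icol][icol] * jamp[icol].imag();
    for( int jcol = 0; jcol < icol; jcol++ )
    {
      ztempR += 2 * m[icol][jcol] * jamp[jcol].real(); // each off-diagonal pair counted via the factor 2
      ztempI += 2 * m[icol][jcol] * jamp[jcol].imag();
    }
    me2 += ztempR * jamp[icol].real() + ztempI * jamp[icol].imag();
  }
  return me2;
}

This mirrors the icol loop with jcol < icol in the new color_sum_kernel shown earlier; the imaginary cross terms drop out because the matrix is real, which is the AMA + BMB identity quoted in the removed comment.
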
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -775,7 +718,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -810,6 +757,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MW ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_MW, (fptype)m_pars->mdl_WT }; @@ -853,6 +804,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MW ); m_masses.push_back( Parameters_sm_no_b_mass::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -973,8 +928,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -982,25 +937,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity 
selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1145,13 +1278,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1163,18 +1290,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1199,93 +1331,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1327,7 +1396,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1350,7 +1419,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1359,21 +1428,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1387,8 +1458,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1404,11 +1477,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1510,14 +1584,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h index d0dd16c512..28103f2454 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm_no_b_mass.h" #include @@ -76,6 +77,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 96; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 12; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -123,7 +125,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -131,9 +133,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -153,34 +157,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, 
running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig.f index 7c1bbde100..181a9c7408 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig1.f index ece4509a8c..e6c9ab31c6 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -138,14 +138,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) DX2=PDG2PDF(LPP(IB(2)),-1, IB(2),XBK(IB(2)), QSCALE) @@ -224,7 +224,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -296,6 +296,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -379,14 +383,14 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -456,51 +460,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/color_sum.cc new file mode 100644 index 0000000000..42eca2f7c9 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/color_sum.cc @@ -0,0 +1,429 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
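// A minimal C++ sketch of the warp-blocked event indexing used in the restructured DSIG1_VEC loop
// above: events are processed warp by warp, the beam ordering IB(1)/IB(2) is fixed once per warp
// from the per-warp mirror flag, and the flat event index is IVEC=(CURR_WARP-1)*WARP_SIZE+IWARP.
// Fortran is 1-based; this sketch uses 0-based indices, and names such as warpSize, nbWarpUsed and
// imirrorPerWarp are illustrative only, not the generated code.
#include <cstdio>
#include <vector>

int main()
{
  const int warpSize = 4;                           // events per warp (WARP_SIZE), illustrative value
  const int nbWarpUsed = 2;                         // number of warps used (NB_WARP_USED), illustrative value
  const std::vector<int> imirrorPerWarp = { 1, 2 }; // per-warp mirror flag (IMIRROR_VEC), illustrative values
  for( int curwarp = 0; curwarp < nbWarpUsed; curwarp++ )
  {
    // Beam ordering is fixed once per warp, as in the Fortran IB(1)/IB(2) assignment
    const int ib0 = ( imirrorPerWarp[curwarp] == 1 ? 0 : 1 );
    const int ib1 = 1 - ib0;
    for( int iwarp = 0; iwarp < warpSize; iwarp++ )
    {
      const int ivec = curwarp * warpSize + iwarp; // flat event index, cf. IVEC=(CURR_WARP-1)*WARP_SIZE+IWARP
      printf( "warp=%d lane=%d ivec=%d beams=(%d,%d)\n", curwarp, iwarp, ivec, ib0, ib1 );
    }
  }
  return 0;
}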
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
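// A minimal standalone sketch of the triangular color sum described above, using the same 4x4
// color matrix and unit color denominators shown for this process: since the color matrix is real
// and symmetric, the quadratic form J^dagger (CF/denom) J reduces to Re(J).M.Re(J) + Im(J).M.Im(J),
// and only the upper triangle (with off-diagonal entries doubled) needs to be visited. The toy
// jamp values below are arbitrary illustration inputs, not physics output.
#include <cassert>
#include <cmath>
#include <complex>
#include <cstdio>

int main()
{
  constexpr int ncolor = 4;
  constexpr double denom[ncolor] = { 1, 1, 1, 1 };
  constexpr double cf[ncolor][ncolor] = { { 12, 4, 4, 0 }, { 4, 12, 0, 4 }, { 4, 0, 12, 4 }, { 0, 4, 4, 12 } };
  const std::complex<double> jamp[ncolor] = { { 0.1, -0.2 }, { 0.3, 0.4 }, { -0.5, 0.6 }, { 0.7, -0.8 } };
  // Naive reference: full double loop over the square color matrix
  double meFull = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      meFull += std::real( std::conj( jamp[i] ) * ( cf[i][j] / denom[i] ) * jamp[j] );
  // Triangular version: diagonal once, off-diagonal terms doubled, real and imaginary parts summed separately
  double meTri = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = ( cf[i][i] / denom[i] ) * jamp[i].real();
    double ztempI = ( cf[i][i] / denom[i] ) * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztempR += ( 2 * cf[i][j] / denom[i] ) * jamp[j].real();
      ztempI += ( 2 * cf[i][j] / denom[i] ) * jamp[j].imag();
    }
    meTri += jamp[i].real() * ztempR + jamp[i].imag() * ztempI;
  }
  printf( "full=%f triangular=%f\n", meFull, meTri );
  assert( std::abs( meFull - meTri ) < 1e-12 ); // the two formulations agree
  return 0;
}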
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/configs.inc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/configs.inc index 54530d6f24..b65b28a284 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/configs.inc +++ 
b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/configs.inc @@ -174,3 +174,5 @@ C Diagram 12 DATA (SPROP(I,-4,12),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/12/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/cudacpp_overlay.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/driver.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/driver.f index d8518f17f7..439883b7b1 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/driver.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/fbridge.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/makefile_original.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/matrix1.f index e550640e16..e06cd80f95 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -72,10 +72,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -316,17 +313,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -403,7 +389,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -447,7 +433,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(7) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -492,33 +479,35 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(1,2,4) T(3,6) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(1,2,6) T(3,4) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(1,3,4) T(2,6) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(1,3,6) T(2,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) - IF(MDL_WW.NE.0D0) FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW - $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + + IF(MDL_WW.NE.0D0) THEN + FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW + $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + ELSE + FK_MDL_WW = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
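// A minimal C++ sketch of the packed color-matrix storage introduced above in matrix1.f: the
// NCOLOR x NCOLOR real symmetric color matrix is now stored as its upper triangle in a 1-D integer
// array CF(NCOLOR*(NCOLOR+1)/2), with off-diagonal entries pre-doubled and a single common DENOM
// factored out, and a running counter walks the triangle row by row exactly as CF_INDEX does in
// the Fortran summation loop that follows. The jamp values are arbitrary illustration inputs.
#include <complex>
#include <cstdio>

int main()
{
  constexpr int ncolor = 4;
  constexpr int denom = 1; // common denominator (DENOM)
  // Packed upper triangle of {{12,4,4,0},{4,12,0,4},{4,0,12,4},{0,4,4,12}}, row by row,
  // with off-diagonal entries doubled (4 -> 8), matching the DATA statements above
  constexpr int cfPacked[ncolor * ( ncolor + 1 ) / 2] = { 12, 8, 8, 0, 12, 0, 8, 12, 8, 12 };
  const std::complex<double> jamp[ncolor] = { { 0.1, -0.2 }, { 0.3, 0.4 }, { -0.5, 0.6 }, { 0.7, -0.8 } };
  double matrix = 0;
  int cfIndex = 0; // running index into the packed triangle (CF_INDEX)
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = i; j < ncolor; j++ ) ztemp += double( cfPacked[cfIndex++] ) * jamp[j]; // ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J)
    matrix += std::real( ztemp * std::conj( jamp[i] ) ); // real part only: the color matrix is real and symmetric
  }
  matrix /= denom; // MATRIX1 = MATRIX1/DENOM
  printf( "|M|^2 color sum from packed triangle = %f\n", matrix );
  return 0;
}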
@@ -601,10 +590,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -613,6 +604,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc index 16d1e89a53..3524120821 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm_no_b_mass.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -97,9 +99,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -107,10 +110,7 @@ namespace mg5amcCpu using Parameters_sm_no_b_mass_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_no_b_mass_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -169,43 +169,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ 
INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = 
DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -217,7 +273,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -226,14 +281,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
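// A minimal sketch in the spirit of the Host/Device memory-access helpers selected above
// ("non-trivial access: buffer includes all events"): one flat buffer holds a quantity for all
// events, ieventAccessRecord jumps to the record of a given event, and kernelAccess then reads or
// writes within that record. The class below is a simplified illustration with hypothetical names,
// not the real MemoryAccess* API.
#include <cstdio>
#include <vector>

class HostAccessScalarPerEvent
{
public:
  // Return the address of the record for event ievt inside a buffer that contains all events
  static double* ieventAccessRecord( double* buffer, const int ievt ) { return &buffer[ievt]; }
  // Access the value inside a single-event record (trivial here: one scalar per event)
  static double& kernelAccess( double* record ) { return *record; }
};

int main()
{
  const int nevt = 8;
  std::vector<double> allMEs( nevt, 0. ); // one running |M|^2 sum per event
  for( int ievt = 0; ievt < nevt; ievt++ )
  {
    double* MEs = HostAccessScalarPerEvent::ieventAccessRecord( allMEs.data(), ievt );
    HostAccessScalarPerEvent::kernelAccess( MEs ) += 0.5 * ievt; // add a dummy per-helicity contribution
  }
  for( int ievt = 0; ievt < nevt; ievt++ ) printf( "ievt=%d ME=%f\n", ievt, allMEs[ievt] );
  return 0;
}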
@@ -259,14 +317,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -290,7 +344,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -304,7 +357,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -315,6 +367,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -506,156 +562,43 @@ namespace mg5amcCpu jamp_sv[3] -= 1. / 2. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gu_ttxwpd()?) 
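// A minimal sketch of the event-by-event color choice that the jamp2 running sums above feed
// (see also select_col further below): each leading color flow icol gets a weight jamp2[icol]
// accumulated over the good helicities, and a uniform random number then picks one flow with
// probability proportional to its weight. This simplified version ignores the per-channel
// filtering of allowed color flows done in the real code; the numbers are illustration inputs.
#include <cstdio>

int main()
{
  constexpr int ncolor = 4;
  const double jamp2[ncolor] = { 0.4, 1.2, 0.1, 0.3 }; // per-color weights summed over helicities
  const double rndcol = 0.65;                          // uniform random number in [0,1)
  double total = 0;
  for( int icol = 0; icol < ncolor; icol++ ) total += jamp2[icol];
  // Walk the cumulative distribution and stop at the first color whose cumulative weight exceeds rndcol*total
  int selcol = ncolor - 1; // fallback to the last color (guards against rounding at the upper edge)
  double cumulative = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    cumulative += jamp2[icol];
    if( rndcol * total < cumulative ) { selcol = icol; break; }
  }
  printf( "selected color flow (0-based) = %d\n", selcol ); // NB Fortran reports a 1-based index
  return 0;
}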
- - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
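// A minimal sketch of the mixed-precision page handling referenced above and implemented just
// below (double for the Feynman amplitudes, float for the color algebra): two neppV-wide double
// pages are merged into one 2*neppV-wide float vector, the color algebra runs once in float, and
// the result is split back into the two double running sums. Plain arrays replace the SIMD vector
// types here, and the merge/split steps are hypothetical stand-ins for fpvmerge/fpvsplit.
#include <cstdio>

int main()
{
  constexpr int neppV = 4;                            // events per double-precision SIMD page (illustrative value)
  const double page0[neppV] = { 1.0, 2.0, 3.0, 4.0 }; // e.g. Re(jamp) for the first page of events
  const double page1[neppV] = { 5.0, 6.0, 7.0, 8.0 }; // e.g. Re(jamp) for the second page of events
  // "fpvmerge": pack the two double pages into one float vector of twice the width
  float merged[2 * neppV];
  for( int i = 0; i < neppV; i++ ) merged[i] = static_cast<float>( page0[i] );
  for( int i = 0; i < neppV; i++ ) merged[neppV + i] = static_cast<float>( page1[i] );
  // Single-precision "color algebra" step, done once for both pages (here just a square)
  float result[2 * neppV];
  for( int i = 0; i < 2 * neppV; i++ ) result[i] = merged[i] * merged[i];
  // "fpvsplit0/fpvsplit1": unpack the float result back into the two double running sums
  double deltaMEs0 = 0, deltaMEs1 = 0;
  for( int i = 0; i < neppV; i++ ) deltaMEs0 += result[i];
  for( int i = 0; i < neppV; i++ ) deltaMEs1 += result[neppV + i];
  printf( "page0 sum=%f page1 sum=%f\n", deltaMEs0, deltaMEs1 );
  return 0;
}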
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -775,7 +718,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -810,6 +757,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MW ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_MW, (fptype)m_pars->mdl_WT }; @@ -853,6 +804,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MW ); m_masses.push_back( Parameters_sm_no_b_mass::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -973,8 +928,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -982,25 +937,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity 
selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1145,13 +1278,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1163,18 +1290,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1199,93 +1331,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1327,7 +1396,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1350,7 +1419,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1359,21 +1428,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1387,8 +1458,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1404,11 +1477,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1510,14 +1584,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h index f799f32129..3be1db3774 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm_no_b_mass.h" #include @@ -76,6 +77,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 96; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 12; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -123,7 +125,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -131,9 +133,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -153,34 +157,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, 
running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig.f index e5ddbf348a..f1c5e0251f 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig1.f index 4ebece2e78..3bde9a0625 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -138,14 +138,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF U2=PDG2PDF(LPP(IB(2)),2, IB(2),XBK(IB(2)), QSCALE) C2=PDG2PDF(LPP(IB(2)),4, IB(2),XBK(IB(2)), QSCALE) @@ -224,7 +224,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -296,6 +296,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -379,14 +383,14 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) U2(IVEC)=PDG2PDF(LPP(IB(2)),2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) C2(IVEC)=PDG2PDF(LPP(IB(2)),4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -456,51 +460,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/color_sum.cc new file mode 100644 index 0000000000..42eca2f7c9 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/color_sum.cc @@ -0,0 +1,429 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
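The auto_dsig1.f hunk above replaces the flat loop over IVEC with a warp-by-warp loop, so that the per-warp quantities IMIRROR_VEC and ICONF_VEC can be applied to every event of a warp before unweighting. A C++ sketch of the index mapping only (variable names and sizes are illustrative, not the Fortran ones):

#include <cstdio>
#include <vector>
int main()
{
  // Illustrative sizes only: 2 warps of 4 events each (the real code uses NB_WARP_USED and WARP_SIZE)
  const int nbWarpUsed = 2;
  const int warpSize = 4;
  const std::vector<int> imirrorVec = { 1, 2 }; // one mirror flag per warp, as IMIRROR_VEC(CURR_WARP)
  for( int curwarp = 0; curwarp < nbWarpUsed; curwarp++ )
  {
    // Per-warp setup: the beam ordering is decided once for all events of the warp
    const int ib1 = ( imirrorVec[curwarp] == 1 ? 1 : 2 );
    const int ib2 = ( imirrorVec[curwarp] == 1 ? 2 : 1 );
    for( int ilane = 0; ilane < warpSize; ilane++ )
    {
      const int ievt = curwarp * warpSize + ilane; // flat index, as IVEC = (CURR_WARP-1)*WARP_SIZE + IWARP (0-based here)
      std::printf( "warp=%d lane=%d ievt=%d IB=(%d,%d)\n", curwarp, ilane, ievt, ib1, ib2 );
    }
  }
  return 0;
}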
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
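The comment above relies on two properties of the color matrix: it is real, so the hermitian form J^dagger M J splits into separate contributions from Re(J) and Im(J) with no cross terms, and it is symmetric, so the double loop can be folded onto the upper triangle with the factor 2 absorbed at compile time. A scalar sketch follows (hypothetical 2-colour matrix, plain std::complex instead of the cxtype_sv vector types) that checks the folded loop reproduces the full sum; it is an illustration of the identity, not code from the plugin.

#include <cassert>
#include <cmath>
#include <complex>
int main()
{
  constexpr int ncolor = 2;
  const double denom[ncolor] = { 3, 3 };
  const double cf[ncolor][ncolor] = { { 16, -2 }, { -2, 16 } }; // real symmetric
  const std::complex<double> jamp[ncolor] = { { 1.5, -0.5 }, { 0.25, 2.0 } };
  // Full sum: ME = sum_ij conj(jamp_i) * (cf_ij/denom_i) * jamp_j, real for a real symmetric matrix
  std::complex<double> meFull = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      meFull += std::conj( jamp[i] ) * ( cf[i][j] / denom[i] ) * jamp[j];
  // Folded sum: real and imaginary parts handled separately, upper triangle only, factor 2 off-diagonal
  double meFolded = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = ( cf[i][i] / denom[i] ) * jamp[i].real();
    double ztempI = ( cf[i][i] / denom[i] ) * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztempR += 2 * cf[i][j] / denom[i] * jamp[j].real();
      ztempI += 2 * cf[i][j] / denom[i] * jamp[j].imag();
    }
    meFolded += ztempR * jamp[i].real() + ztempI * jamp[i].imag();
  }
  assert( std::abs( meFull.imag() ) < 1e-12 );              // imaginary part cancels by symmetry
  assert( std::abs( meFull.real() - meFolded ) < 1e-12 );   // folded loop equals the full double loop
  return 0;
}

The folding with /denom[icol] is exact as long as cf[i][j]/denom[i] is itself symmetric (equal denominators here), which is the same assumption made by the triangular implementation below.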
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/configs.inc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/configs.inc index 7767ae3d5e..d95072bf21 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/configs.inc +++ 
b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/configs.inc @@ -174,3 +174,5 @@ C Diagram 12 DATA (SPROP(I,-4,12),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/12/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/cudacpp_overlay.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/driver.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/driver.f index d8518f17f7..439883b7b1 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/driver.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/fbridge.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/makefile_original.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/matrix1.f index 738301d049..4cc5183dce 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -72,10 +72,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -316,17 +313,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -403,7 +389,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -447,7 +433,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(7) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -492,33 +479,35 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(1,3,2) T(6,4) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(1,3,4) T(6,2) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(1,6,2) T(3,4) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(1,6,4) T(3,2) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) - IF(MDL_WW.NE.0D0) FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW - $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + + IF(MDL_WW.NE.0D0) THEN + FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW + $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + ELSE + FK_MDL_WW = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
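[Editor's note] The matrix1.f change above replaces the full REAL*8 CF(NCOLOR,NCOLOR) color matrix with a packed integer upper triangle plus a common DENOM; the off-diagonal entries are doubled (4 becomes 8) so that the J = I, NCOLOR loop in the next hunk still reproduces the full symmetric quadratic form. A minimal C++ sketch of that equivalence follows; the names (colorSumPacked, cfPacked) are illustrative only and are not part of the generated code.

// Illustrative sketch: packed upper-triangular color sum for ncolor = 4,
// equivalent to the full symmetric matrix {12,4,4,0; 4,12,0,4; 4,0,12,4; 0,4,4,12}.
#include <array>
#include <complex>

double colorSumPacked( const std::array<std::complex<double>, 4>& jamp )
{
  constexpr int ncolor = 4;
  // Upper triangle, row by row, with off-diagonal entries doubled (4 -> 8)
  constexpr int cfPacked[ncolor * ( ncolor + 1 ) / 2] = { 12, 8, 8, 0, 12, 0, 8, 12, 8, 12 };
  constexpr int denom = 1; // common denominator factored out of the matrix
  double me = 0;
  int idx = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = i; j < ncolor; j++ ) ztemp += (double)cfPacked[idx++] * jamp[j]; // j >= i only
    me += std::real( ztemp * std::conj( jamp[i] ) ); // doubled off-diagonals supply the missing lower triangle
  }
  return me / denom;
}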
@@ -601,10 +590,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -613,6 +604,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc index 41a6e0002f..4688e54d18 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm_no_b_mass.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -97,9 +99,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -107,10 +110,7 @@ namespace mg5amcCpu using Parameters_sm_no_b_mass_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_no_b_mass_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -169,43 +169,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ 
INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = 
DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -217,7 +273,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -226,14 +281,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
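[Editor's note] The refactored calculate_jamps above is now a per-helicity kernel that writes its color amplitudes into a shared allJamps super-buffer of layout [2][ncolor][nGoodHel][nevt] (real/imaginary part, color, good helicity, event), matching the cuBLAS striding notes earlier in this section. The helper below is a hedged host-side illustration of that flat indexing; the actual code goes through the DeviceAccessJamp/DeviceAccessJamp2 accessors rather than a helper like this.

// Illustrative flat index into the allJamps super-buffer[2][ncolor][nGoodHel][nevt]
// (assumption: mirrors the striding comments in color_sum.cc, not the plugin API)
inline int jampIndex( int ireim,    // 0 = real part, 1 = imaginary part
                      int icol,     // color index in [0, ncolor-1]
                      int ighel,    // good-helicity index in [0, nGoodHel-1]
                      int ievt,     // event index in [0, nevt-1]
                      int ncolor, int nGoodHel, int nevt )
{
  return ( ( ireim * ncolor + icol ) * nGoodHel + ighel ) * nevt + ievt;
}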
@@ -259,14 +317,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -290,7 +344,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -304,7 +357,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -315,6 +367,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -506,156 +562,43 @@ namespace mg5amcCpu jamp_sv[1] -= 1. / 6. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gux_ttxwmdx()?) 
- - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -775,7 +718,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -810,6 +757,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MW ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_MW, (fptype)m_pars->mdl_WT }; @@ -853,6 +804,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MW ); m_masses.push_back( Parameters_sm_no_b_mass::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -973,8 +928,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -982,25 +937,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity 
selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1145,13 +1278,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1163,18 +1290,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1199,93 +1331,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1327,7 +1396,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1350,7 +1419,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1359,21 +1428,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1387,8 +1458,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1404,11 +1477,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1510,14 +1584,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h index b6253b6715..aaf804d7b8 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm_no_b_mass.h" #include @@ -76,6 +77,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 96; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 12; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -123,7 +125,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -131,9 +133,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -153,34 +157,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, 
running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig.f index 8e03eed7eb..e58319a9cb 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig1.f index 9d0ddcecfc..2361e40053 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -138,14 +138,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) UX2=PDG2PDF(LPP(IB(2)),-2, IB(2),XBK(IB(2)), QSCALE) @@ -224,7 +224,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -296,6 +296,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -379,14 +383,14 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -456,51 +460,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/color_sum.cc new file mode 100644 index 0000000000..42eca2f7c9 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/color_sum.cc @@ -0,0 +1,429 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
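As an editorial aside on the auto_dsig1.f rewrite above (illustrative only, not part of the generated patch): the flat IVEC loop is replaced by a nested CURR_WARP/IWARP loop so that per-warp quantities such as IMIRROR_VEC(CURR_WARP) (the IB(1)/IB(2) beam swap) and ICONF_VEC(CURR_WARP) are handled once per warp, while IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP still enumerates every event exactly once. A minimal host-side C++ sketch of that index mapping, with hypothetical warp counts and 0-based indices:

// Illustrative sketch only: hypothetical NB_WARP_USED/WARP_SIZE values,
// 0-based C++ indices instead of the Fortran 1-based ones.
#include <cassert>
#include <vector>
int main()
{
  const int nbWarpUsed = 4, warpSize = 8;             // hypothetical values
  const int vecsizeUsed = nbWarpUsed * warpSize;      // events processed per call
  const std::vector<int> imirrorVec = { 1, 2, 1, 2 }; // hypothetical per-warp mirror flags
  std::vector<int> visits( vecsizeUsed, 0 );
  for( int currWarp = 0; currWarp < nbWarpUsed; currWarp++ )
  {
    // The beam assignment is decided once per warp (the IB(1)/IB(2) swap)
    const int ib1 = ( imirrorVec[currWarp] == 1 ? 1 : 2 );
    const int ib2 = ( imirrorVec[currWarp] == 1 ? 2 : 1 );
    assert( ib1 + ib2 == 3 ); // always beams {1,2}, only their order changes
    for( int iwarp = 0; iwarp < warpSize; iwarp++ )
    {
      const int ivec = currWarp * warpSize + iwarp; // Fortran: (CURR_WARP-1)*WARP_SIZE+IWARP
      visits[ivec]++;
    }
  }
  for( int v : visits ) assert( v == 1 ); // every event index is covered exactly once
  return 0;
}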
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
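For reference (editorial note, not part of the generated code), the identity used in the comment above can be written out. Let J = A + iB be the vector of color amplitudes (A, B real) and let \(\hat M_{ij} = \mathrm{colorMatrix}[i][j]/\mathrm{colorDenom}[i]\) be the normalized color matrix, which is symmetric here since all denominators equal 1. Then

\[
J^{\dagger} \hat M J
\;=\; (A - iB)^{T} \hat M (A + iB)
\;=\; A^{T}\hat M A \;+\; i\,A^{T}\hat M B \;-\; i\,B^{T}\hat M A \;+\; B^{T}\hat M B
\;=\; A^{T}\hat M A \;+\; B^{T}\hat M B ,
\]

where the cross terms cancel because \(\hat M^{T} = \hat M\). The same symmetry lets each remaining real quadratic form be folded onto the upper triangle,

\[
A^{T}\hat M A \;=\; \sum_{i} \hat M_{ii}\,A_{i}^{2} \;+\; 2 \sum_{i<j} A_{i}\,\hat M_{ij}\,A_{j} ,
\]

which is exactly what the icol/jcol loop below computes, with the factor 2 and the division by colorDenom pre-baked into cf2 at compile time.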
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/configs.inc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/configs.inc index 5b08a7cb7c..644de652d9 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/configs.inc +++ 
b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/configs.inc @@ -174,3 +174,5 @@ C Diagram 12 DATA (SPROP(I,-4,12),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/12/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/cudacpp_overlay.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/driver.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/driver.f index d8518f17f7..439883b7b1 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/driver.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/fbridge.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/makefile_original.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/matrix1.f index 6b3ff14d2d..676bb91921 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -72,10 +72,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -316,17 +313,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -403,7 +389,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -447,7 +433,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(7) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -492,33 +479,35 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(1,2,4) T(3,6) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(1,2,6) T(3,4) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(1,3,4) T(2,6) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(1,3,6) T(2,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) - IF(MDL_WW.NE.0D0) FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW - $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + + IF(MDL_WW.NE.0D0) THEN + FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW + $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + ELSE + FK_MDL_WW = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
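Editorial note on the matrix1.f change above: the full REAL*8 CF(NCOLOR,NCOLOR) array is replaced by a packed integer upper triangle CF(NCOLOR*(NCOLOR+1)/2), with the off-diagonal entries apparently stored already doubled (4 becomes 8) and a common DENOM divided out once at the end of the color loop (the MATRIX1/DENOM line in the next hunk); the DO J = I, NCOLOR loop with a running CF_INDEX then reproduces the full symmetric sum. A small C++ sketch, illustrative only and reusing the values from the DATA statements above, unpacks that representation back into the 4x4 matrix hardcoded in color_sum.cc:

// Illustrative only: unpack the upper-triangular integer CF of matrix1.f
// (off-diagonals stored doubled, common denominator DENOM) into the full
// symmetric color matrix and compare with the one in color_sum.cc.
#include <cassert>
int main()
{
  constexpr int ncolor = 4;
  const int cfPacked[ncolor * ( ncolor + 1 ) / 2] = { 12, 8, 8, 0, 12, 0, 8, 12, 8, 12 }; // DATA (CF(I),...)
  const int denom = 1;                                                                    // DATA DENOM/1/
  const double expected[ncolor][ncolor] = { { 12, 4, 4, 0 },
                                            { 4, 12, 0, 4 },
                                            { 4, 0, 12, 4 },
                                            { 0, 4, 4, 12 } }; // colorMatrix in color_sum.cc
  int cfIndex = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = i; j < ncolor; j++ )
    {
      const double cfij = static_cast<double>( cfPacked[cfIndex++] ) / denom;
      const double full = ( i == j ? cfij : cfij / 2 ); // off-diagonals were stored doubled
      assert( full == expected[i][j] && full == expected[j][i] );
    }
  return 0;
}

This is the same doubled-off-diagonal convention used by TriangularNormalizedColorMatrix in the C++ color sum.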
@@ -601,10 +590,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -613,6 +604,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc index f90db593a9..c5be0b0677 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm_no_b_mass.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -97,9 +99,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -107,10 +110,7 @@ namespace mg5amcCpu using Parameters_sm_no_b_mass_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_no_b_mass_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -169,43 +169,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ 
INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = 
DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -217,7 +273,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -226,14 +281,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
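Editorial aside on the buffer shape introduced by this refactoring: the per-helicity jamp super-buffer is documented in the comments above as jamp[2][ncolor][nGoodHel][nevt], with the real and imaginary parts split into two planes so that the BLAS path can treat them as separate real matrices. A small host-side sketch of that flat indexing (illustrative only; the plugin's own accessor is DeviceAccessJamp, which is not reproduced here):

// Illustrative host-side sketch of the [2][ncolor][nGoodHel][nevt] flat layout
// described in the comments above (the real accessor class is DeviceAccessJamp).
#include <cassert>
#include <cstddef>
// Flat offset of element (ix2, icol, ighel, ievt): ix2=0 real plane, ix2=1 imaginary plane
inline std::size_t jampIndex( int ix2, int icol, int ighel, int ievt, int ncolor, int nhel, int nevt )
{
  return static_cast<std::size_t>( ix2 ) * ncolor * nhel * nevt
         + static_cast<std::size_t>( icol ) * nhel * nevt
         + static_cast<std::size_t>( ighel ) * nevt + ievt;
}
int main()
{
  const int ncolor = 4, nhel = 6, nevt = 32; // hypothetical sizes (nhel = number of good helicities)
  // The imaginary plane starts ncolor*nhel*nevt elements after the real plane
  assert( jampIndex( 1, 0, 0, 0, ncolor, nhel, nevt ) - jampIndex( 0, 0, 0, 0, ncolor, nhel, nevt )
          == static_cast<std::size_t>( ncolor ) * nhel * nevt );
  // Events for a given (icol, ighel) are contiguous, which is what the per-helicity
  // kernels and the BLAS striding rely on
  assert( jampIndex( 0, 2, 3, 5, ncolor, nhel, nevt ) + 1 == jampIndex( 0, 2, 3, 6, ncolor, nhel, nevt ) );
  return 0;
}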
@@ -259,14 +317,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -290,7 +344,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -304,7 +357,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -315,6 +367,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -506,156 +562,43 @@ namespace mg5amcCpu jamp_sv[3] += 1. / 6. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_udx_ttxwpg()?) 
- - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -775,7 +718,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -810,6 +757,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MW ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_MW, (fptype)m_pars->mdl_WT }; @@ -853,6 +804,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MW ); m_masses.push_back( Parameters_sm_no_b_mass::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -973,8 +928,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -982,25 +937,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity 
selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1145,13 +1278,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1163,18 +1290,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1199,93 +1331,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1327,7 +1396,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1350,7 +1419,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1359,21 +1428,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1387,8 +1458,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1404,11 +1477,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1510,14 +1584,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h index b4a0ccb74d..fc664f5841 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm_no_b_mass.h" #include @@ -76,6 +77,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 96; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 12; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -123,7 +125,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -131,9 +133,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -153,34 +157,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, 
running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig.f index 7e750641c8..1ca5709cc3 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig1.f index 28ad0eed08..ca38b13683 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -138,7 +138,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) C1=PDG2PDF(LPP(IB(1)),4, IB(1),XBK(IB(1)), QSCALE) @@ -146,7 +146,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) DX2=PDG2PDF(LPP(IB(2)),-1, IB(2),XBK(IB(2)), QSCALE) @@ -225,7 +225,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -297,6 +297,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -380,16 +384,16 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -459,51 +463,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/color_sum.cc new file mode 100644 index 0000000000..42eca2f7c9 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/color_sum.cc @@ -0,0 +1,429 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
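As a rough illustration of what the new color_sum.cc computes (this is not the generated code itself): for each event and helicity, the matrix element receives the color sum |M|^2 += sum_{i,j} Re( conj(J_i) * CF_ij * J_j ) / denom_i over the ncolor=4 QCD partial amplitudes J (the "jamps"). A minimal standalone C++ sketch follows; the jamp values are hypothetical, while ncolor, colorMatrix and colorDenom mirror the values defined for this process below.

// Minimal standalone sketch (not the generated code): the naive color sum for one event
// and one helicity. The jamp values are hypothetical placeholders.
#include <complex>
#include <cstdio>

int main()
{
  constexpr int ncolor = 4;
  const double colorDenom[ncolor] = { 1, 1, 1, 1 };
  const double colorMatrix[ncolor][ncolor] = { { 12, 4, 4, 0 },
                                               { 4, 12, 0, 4 },
                                               { 4, 0, 12, 4 },
                                               { 0, 4, 4, 12 } };
  const std::complex<double> jamp[ncolor] = { { 0.1, -0.2 }, { 0.3, 0.0 }, { -0.1, 0.4 }, { 0.2, 0.1 } }; // hypothetical
  double me2 = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    std::complex<double> ztemp = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ )
      ztemp += colorMatrix[icol][jcol] * jamp[jcol]; // row of the color matrix times the jamp vector
    me2 += std::real( std::conj( jamp[icol] ) * ztemp ) / colorDenom[icol]; // |M|^2 contribution
  }
  printf( "|M|^2 (color sum) = %f\n", me2 );
  return 0;
}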
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/configs.inc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/configs.inc index 939cb376b9..d418740afe 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/configs.inc +++ 
b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/configs.inc @@ -180,3 +180,5 @@ C Diagram 12 DATA (SPROP(I,-4,12),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/12/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/cudacpp_overlay.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/driver.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/driver.f index d8518f17f7..439883b7b1 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/driver.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/fbridge.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/makefile_original.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/matrix1.f index 536bec2827..f501bedaee 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -72,10 +72,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -316,17 +313,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -403,7 +389,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -447,7 +433,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(7) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -492,33 +479,35 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(2,1) T(6,3,4) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(2,4) T(6,3,1) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(3,1) T(6,2,4) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(3,4) T(6,2,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) - IF(MDL_WW.NE.0D0) FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW - $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + + IF(MDL_WW.NE.0D0) THEN + FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW + $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + ELSE + FK_MDL_WW = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
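The DATA statements above replace the dense REAL*8 CF(NCOLOR,NCOLOR) color matrix by a packed INTEGER array holding only the NCOLOR*(NCOLOR+1)/2 upper-triangle entries plus a common DENOM, and the summation loop in the next hunk walks that array linearly through CF_INDEX, with the inner loop restricted to J >= I. A minimal C++ sketch of the same packed quadratic form (hypothetical names, not generated code):

#include <complex>
#include <cstddef>
#include <vector>

// Illustration only: color sum with the symmetric color matrix stored as in the new
// DATA statements, i.e. only the upper triangle (J >= I), as integers, with the
// off-diagonal entries already doubled (4 -> 8 above) and one common denominator.
// A single triangular sweep then reproduces the full quadratic form, because
// Re( conj(J_i) C_ij J_j ) + Re( conj(J_j) C_ji J_i ) = 2 C_ij Re( conj(J_i) J_j ).
double colorSumPacked( const std::vector<int>& cfPacked,               // ncolor*(ncolor+1)/2 entries
                       int denom,                                      // common denominator (DENOM)
                       const std::vector<std::complex<double>>& jamp ) // ncolor jamps
{
  const std::size_t ncolor = jamp.size();
  double me2 = 0.;
  std::size_t cfIndex = 0;
  for( std::size_t i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp( 0., 0. );
    for( std::size_t j = i; j < ncolor; j++ )
      ztemp += double( cfPacked[cfIndex++] ) * jamp[j]; // mirrors the CF_INDEX inner loop
    me2 += std::real( ztemp * std::conj( jamp[i] ) );
  }
  return me2 / denom;
}

Compared to the old full NCOLOR x NCOLOR loop this halves both the storage and the number of multiplications, while the final division by DENOM keeps the stored matrix entries as small integers.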
@@ -601,10 +590,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -613,6 +604,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/addmothers.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/addmothers.f index 9a31ed201d..d6cded9a2d 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else @@ -220,7 +221,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, ncolmp=0 endif if(mo_color.gt.1.and. - $ mo_color.ne.3.and.mo_color.ne.8)then + $ mo_color.ne.3.and.mo_color.ne.8.and.mo_color.ne.6)then da_color(1)=get_color(jpart(1,ida(1))) da_color(2)=get_color(jpart(1,ida(2))) call write_error(da_color(1), da_color(2), mo_color) @@ -326,8 +327,8 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif endif !end of check on LC -c Just zero helicity info for intermediate states - jpart(7,i) = 0 +c Just No helicity info for intermediate states + jpart(7,i) = 9 enddo ! 
do i 100 continue if (is_LC) call check_pure_internal_flow(icolalt,jpart, maxcolor) @@ -586,13 +587,13 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, i3=i3+1 c color for t-channels needs to be reversed if(i3.eq.1) icol(2,ires)=icolmp(1,i) - if(i3.eq.2) icol(1,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 c color for t-channels needs to be reversed if(i3bar.eq.1) icol(1,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(2,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(2,i) endif enddo @@ -764,6 +765,14 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, endif endif c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) + elseif(mo_color.eq.6.and.i3.eq.0.and.i3bar.eq.2)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue + elseif(mo_color.eq.6.and.i3.eq.2.and.i3bar.eq.0)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue else c Don't know how to deal with this call write_error(i3,i3bar,mo_color) @@ -814,12 +823,12 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(icolmp(1,i).gt.0)then i3=i3+1 if(i3.eq.1) icol(1,ires)=icolmp(1,i) - if(i3.eq.2) icol(2,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 if(i3bar.eq.1) icol(2,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(1,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(2,i) endif enddo @@ -830,23 +839,33 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(n3.le.1.and.n3bar.eq.0) icol(2,ires)=0 if(i3.ne.n3.or.i3bar.ne.n3bar) then - if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.0.and.i3.eq.0)then + if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.i3)then c This is an epsilon index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(1,ires)=maxcolor + if(i3.eq.0) then + maxcolor=maxcolor+1 + icol(1,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(2,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(2,ires)=-maxcolor endif - elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.0.and.i3bar.eq.0)then + elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.i3bar)then c This is an epsilonbar index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(2,ires)=maxcolor + if(i3bar.eq.0)then + maxcolor=maxcolor+1 + icol(2,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(1,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(1,ires)=-maxcolor endif elseif(n3.gt.0.and.n3bar.eq.0.and.i3-i3bar.eq.n3.or. $ n3bar.gt.0.and.n3.eq.0.and.i3bar-i3.eq.n3bar.or. 
@@ -961,6 +980,12 @@ subroutine fix_s_color_indices(n3,n3bar,i3,i3bar,ncolmp,icolmp, if(n3.eq.1) icol(1,ires)=max_n3 if(n3bar.eq.1) icol(2,ires)=min_n3bar endif + do i=ires,-1 + if (icol(1,i).eq.maxcol) icol(1,i)=mincol + if (icol(1,i).eq.-maxcol) icol(1,i)=-mincol + if (icol(2,i).eq.maxcol) icol(2,i)=mincol + if (icol(2,i).eq.-maxcol) icol(2,i)=-mincol + enddo c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) endif else diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cluster.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const 
int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. 
Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
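The HASBLAS handling above only ever adds -DMGONGPU_HAS_NO_BLAS to CXXFLAGS/GPUFLAGS (for hasNoBlas) or -lcublas/-lhipblas to BLASLIBFLAGS (for hasBlas), so the C++/CUDA sources must compile in both configurations. A minimal C++ sketch of how a translation unit can honour that switch (hypothetical helper, not plugin code):

// Illustration only: hasNoBlas builds define MGONGPU_HAS_NO_BLAS and must compile
// out any cuBLAS/hipBLAS dependence; hasBlas builds rely on BLASLIBFLAGS at link time.
bool blasColorSumAvailable()
{
#ifndef MGONGPU_HAS_NO_BLAS
  return true;  // a BLAS color sum may be requested (cf. the pBlasHandle path in color_sum_gpu)
#else
  return false; // kernel-only color sum (cf. the null-pBlasHandle assert in color_sum_gpu)
#endif
}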
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cuts.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. + return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/fbridge.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. 
Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. + +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/genps.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/genps.f index 1c32e93f5d..5449ab9e30 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/genps.f @@ -124,7 +124,8 @@ subroutine gen_mom(iconfig,mincfig,maxcfig,invar,wgt,x,p1) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer fake_id + common/to_sprop/sprop,tprid,fake_id logical firsttime double precision xprop(3,nexternal),tprop(3,nexternal) @@ -1373,6 +1374,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1389,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not. 
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1637,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1657,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1818,8 +1834,8 @@ double precision function get_channel_cut(p, config) common/to_forest/ iforest, tstrategy integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) - integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer tprid(-max_branch:-1,lmaxconfigs), fake_id + common/to_sprop/sprop,tprid,fake_id double precision stot,m1,m2 common/to_stot/stot,m1,m2 @@ -1915,7 +1931,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1946,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/makefile b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/makefile deleted file mode 100644 index 49e6800fff..0000000000 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/makefile +++ /dev/null @@ -1,327 +0,0 @@ -SHELL := /bin/bash - -include ../../Source/make_opts - -# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) -# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing -include ../../src/cudacpp_config.mk -ifeq ($(CUDACPP_BUILDDIR),) -$(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) -endif - -# Disable all Fortran warnings? 
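In the get_channel_cut changes above, the propagator factors are rewritten in terms of t - Mass**2 and Mass*Width, so a resonant s-channel now contributes a plain Breit-Wigner denominator 1/((t - M^2)^2 + (M*Gamma)^2) instead of the previous ((t-Mass)*(t+Mass))-based expression. A minimal C++ sketch of that factor (hypothetical helper, double precision, not the generated Fortran):

// Illustration only: per-propagator Breit-Wigner factor matching the new
// tmp = t - Mass**2 and tmp2 = Mass*Width combination in get_channel_cut.
double breitWignerChannelFactor( double t, double mass, double width )
{
  const double tmp = t - mass * mass; // t - M^2
  const double tmp2 = mass * width;   // M * Gamma
  return 1.0 / ( tmp * tmp + tmp2 * tmp2 );
}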
-FFLAGS+= -w - -# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html -FFLAGS+= -cpp - -# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) -CXXFLAGS = -O3 -Wall -Wshadow -Wextra - -# Add -std=c++17 explicitly to avoid build errors on macOS -# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" -ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 -endif - -# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) -ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) - override CXX:=ccache $(CXX) -endif -###ifeq ($(USECCACHE)$(shell echo $(FC) | grep ccache),1) -### override FC:=ccache $(FC) -###endif - -# Load additional dependencies of the bias module, if present -ifeq (,$(wildcard ../bias_dependencies)) -BIASDEPENDENCIES = -else -include ../bias_dependencies -endif - -# Definitions - -LIBDIR = ../../lib/ -BINDIR = ../../bin/ -PROG = madevent - -ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") - include ../MadLoop_makefile_definitions -else - LINK_LOOP_LIBS = - LOOP_LIBS = - LOOP_INCLUDE = - LINK_MADLOOP_LIB = - MADLOOP_LIB = -endif - -LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias - -CUDACPP_MAKEFILE=cudacpp.mk -processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') -ifeq ($(BACKEND),cuda) -CUDACPP_COMMONLIB=mg5amc_common_cuda -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda -else ifeq ($(BACKEND),hip) -CUDACPP_COMMONLIB=mg5amc_common_hip -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip -else -CUDACPP_COMMONLIB=mg5amc_common_cpp -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp -endif - -LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) - -ifneq ("$(wildcard ../../Source/RUNNING)","") - LINKLIBS += -lrunning - LIBS += $(LIBDIR)librunning.$(libext) -endif - - -# Source files - -MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) -MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) -ifeq ($(strip $(MATRIX_HEL)),) - MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) -endif - - -PROCESS= myamp.o genps.o unwgt.o setcuts.o get_color.o \ - cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ - idenparts.o dummy_fct.o - -DSIG=driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) -DSIG_cudacpp=driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) - -SYMMETRY = symmetry.o idenparts.o - -# Binaries - -ifeq ($(UNAME),Darwin) -LDFLAGS += -lc++ # avoid 'Undefined symbols' for chrono::steady_clock on macOS (checked with otool -L libmg5amc_gg_ttx_cpp.so) -LDFLAGS += -mmacosx-version-min=11.3 # avoid "ld: warning: object file was built for newer macOS version than being linked" -else -LDFLAGS += -Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 (not supported on macOS) -endif - -# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) -.DEFAULT_GOAL := all - -ifeq ($(BACKEND),cuda) -all: $(PROG)_fortran 
$(CUDACPP_BUILDDIR)/$(PROG)_cuda -else ifeq ($(BACKEND),hip) -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip -else -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp -endif - -# Disable OpenMP by default: enable OpenMP only if USEOPENMP=1 (#758) -ifeq ($(USEOPENMP),1) -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -override OMPFLAGS = -fopenmp -LINKLIBS += -liomp5 # see #578 -LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -override OMPFLAGS = -fopenmp -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 -else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang -else -override OMPFLAGS = -fopenmp -endif -endif - -$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o - $(FC) -o $(PROG)_fortran $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) - -$(LIBS): .libs - -.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat - cd ../../Source; make - touch $@ - -$(CUDACPP_BUILDDIR)/.cudacpplibs: - $(MAKE) -f $(CUDACPP_MAKEFILE) - touch $@ - -# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH -# Use relative paths with respect to the executables ($ORIGIN on Linux) -# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary -ifeq ($(UNAME_S),Darwin) - override LIBFLAGSRPATH = -else ifeq ($(USEBUILDDIR),1) - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' -else - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' -endif - -.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link - -madevent_fortran_link: $(PROG)_fortran - rm -f $(PROG) - ln -s $(PROG)_fortran $(PROG) - -madevent_cuda_link: - $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) - -madevent_hip_link: - $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) - -madevent_cpp_link: - $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -override SUPPORTED_AVXS = cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto -madevent_%_link: - @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then echo "ERROR! 
Invalid target '$@' (supported madevent_cpp*_link targets are: $(foreach avx,$(SUPPORTED_AVXS),'madevent_cpp$(avx)_link'))"; exit 1; fi - $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_cuda now uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_hip also uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -counters.o: counters.cc timer.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -ompnumthreads.o: ompnumthreads.cc ompnumthreads.h - $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ - -$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) - $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) - -gensym: $(SYMMETRY) configs.inc $(LIBS) - $(FC) -o gensym $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) - -###ifeq (,$(wildcard fbridge.inc)) # Pointless: fbridge.inc always exists as this is the cudacpp-modified makefile! -###$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat -### cd ../../Source/MODEL; make -### -###$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat -### cd ../../Source; make -### -###$(LIBDIR)libpdf.$(libext): -### cd ../../Source/PDF; make -### -###$(LIBDIR)libgammaUPC.$(libext): -### cd ../../Source/PDF/gammaUPC; make -###endif - -# Add source so that the compiler finds the DiscreteSampler module. 
-$(MATRIX): %.o: %.f - $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%.o: %.f - $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%_cudacpp.o: %.f - $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ - -# Dependencies - -driver.f: genps.inc -symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc -genps.o: genps.inc nexternal.inc configs.inc -dummy_fct.0: run.inc genps.inc -cuts.o: genps.inc nexternal.inc pmass.inc -setcuts.o: genps.inc run_config.inc -invarients.o: genps.inc nexternal.inc -myamp.o: props.inc genps.inc nexternal.inc -reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ - run_config.inc -cluster.o: cluster.inc genps.inc nexternal.inc message.inc -addmothers.o: genps.inc nexternal.inc symswap.inc message.inc -unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ - run_config.inc -initcluster.o: message.inc - -# Extra dependencies on discretesampler.mod - -auto_dsig.o: .libs -driver.o: .libs -driver_cudacpp.o: .libs -$(MATRIX): .libs -genps.o: .libs - -# Cudacpp bldall targets - -ifeq ($(UNAME_P),ppc64le) -bldavxs: bldnone bldsse4 -else ifeq ($(UNAME_P),arm) -bldavxs: bldnone bldsse4 -else -bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z -endif - -ifneq ($(shell which hipcc 2>/dev/null),) -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldhip bldcuda bldavxs -else -bldall: bldhip bldavxs -endif -else -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldcuda bldavxs -else -bldall: bldavxs -endif -endif - -bldcuda: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cuda - -bldhip: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=hip - -bldnone: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppnone - -bldsse4: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 - -bldavx2: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 - -bld512y: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y - -bld512z: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z - -# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) - -clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn - $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(CUDACPP_BUILDDIR)/$(PROG)_hip - -cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src - $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall - rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs - rm -f .libs - -cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src - make -C ../../Source cleanall - rm -rf $(LIBDIR)libbias.$(libext) - rm -f ../../Source/*.mod ../../Source/*/*.mod - -distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation - $(MAKE) -f $(CUDACPP_MAKEFILE) distclean diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/makefile b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/makefile new file mode 120000 index 0000000000..9fba275947 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/makefile @@ -0,0 +1 @@ +makefile_wrapper.mk \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/makefile_original.mk 
b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/makefile_original.mk new file mode 100644 index 0000000000..348c283be7 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/makefile_original.mk @@ -0,0 +1,101 @@ +include ../../Source/make_opts +FFLAGS+= -w + +# Load additional dependencies of the bias module, if present +ifeq (,$(wildcard ../bias_dependencies)) +BIASDEPENDENCIES = +else +include ../bias_dependencies +endif + +# Definitions + +LIBDIR = ../../lib/ +BINDIR = ../../bin/ +PROG = madevent + +ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") + include ../MadLoop_makefile_definitions +else + LINK_LOOP_LIBS = + LOOP_LIBS = + LOOP_INCLUDE = + LINK_MADLOOP_LIB = + MADLOOP_LIB = +endif + +LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L../../lib/ -ldhelas -ldsample -lmodel -lgeneric -lpdf -lgammaUPC -lcernlib $(llhapdf) -lbias + +LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) + +ifneq ("$(wildcard ../../Source/RUNNING)","") + LINKLIBS += -lrunning + LIBS += $(LIBDIR)librunning.$(libext) +endif + + +# Source files + +MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) +MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) +ifeq ($(strip $(MATRIX_HEL)),) + MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) +endif + + +PROCESS= driver.o myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o \ + $(patsubst %.f,%.o,$(wildcard auto_dsig*.f)) \ + +SYMMETRY = symmetry.o idenparts.o + +# Binaries + +$(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) + $(FC) -o $(PROG) $(PROCESS) $(MATRIX) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) + +$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat + cd ../../Source/MODEL; make + +$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat + cd ../../Source; make + +$(LIBDIR)libpdf.$(libext): + cd ../../Source/PDF; make + +$(LIBDIR)libgammaUPC.$(libext): + cd ../../Source/PDF/gammaUPC; make + +# Add source so that the compiler finds the DiscreteSampler module. 
+$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +# Dependencies + +driver.f: genps.inc +symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc +genps.o: genps.inc nexternal.inc configs.inc +dummy_fct.0: run.inc genps.inc +cuts.o: genps.inc nexternal.inc pmass.inc +setcuts.o: genps.inc run_config.inc +invarients.o: genps.inc nexternal.inc +myamp.o: props.inc genps.inc nexternal.inc +reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ + run_config.inc +cluster.o: cluster.inc genps.inc nexternal.inc message.inc +addmothers.o: genps.inc nexternal.inc symswap.inc message.inc +unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ + run_config.inc +initcluster.o: message.inc + +clean: + $(RM) *.o gensym madevent madevent_forhel diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/myamp.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min with dsqrt_shatmax**2 with the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/reweight.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/reweight.f index 0a0bafa7c1..9d8fe1c4f0 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/reweight.f @@ -976,9 +976,9 @@ logical function setclscales(p, keepq2bck, ivec) $ ' and jcentral is ',jcentral(1),jcentral(2) if (btest(mlevel,3)) then - write(*,'(a$)') 'QCD jets (final): ' + write(*,'(a,$)') 'QCD jets (final): ' do i=3,nexternal - if(iqjets(i).gt.0) write(*,'(i3$)') i + if(iqjets(i).gt.0) write(*,'(i3,$)') i enddo write(*,*) endif @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) @@ -1387,7 +1387,9 @@ double precision function rewgt(p, ivec) integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - include 'configs.inc' + integer fake_id + common/to_sprop/sprop,tprid,fake_id +c include 'configs.inc' real*8 xptj,xptb,xpta,xptl,xmtc real*8 xetamin,xqcut,deltaeta common /to_specxpt/xptj,xptb,xpta,xptl,xmtc,xetamin,xqcut,deltaeta @@ -1588,6 +1590,8 @@ double precision function rewgt(p, ivec) $ ipdgcl(1,igraphs(1),iproc),ipart,.false.).and. $ (goodjet(idacl(n,1)).or.goodjet(idacl(n,2)))) then c alpha_s weight + + if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1600,6 +1604,7 @@ double precision function rewgt(p, ivec) write(*,*)' as: ',alphas(alpsfact*dsqrt(q2now)), & '/',asref,' -> ',alphas(alpsfact*dsqrt(q2now))/asref write(*,*)' and G=',SQRT(4d0*PI*ALPHAS(scale)) + endif endif endif endif diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/runTest.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/symmetry.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/symmetry.f index 309540a0a2..d0706e90b4 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/symmetry.f @@ -51,6 +51,7 @@ program symmetry integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) + integer fake_id include 'configs.inc' data use_config/0,lmaxconfigs*0/ @@ -232,7 +233,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +243,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +261,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/unwgt.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/banner.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/banner.py index 42d82818d0..2bc6174b85 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/banner.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3306,7 +3353,7 @@ def edit_dummy_fct_from_file(self, filelist, outdir): def retro_compatible_custom_fct(lines, mode=None): f77_type = ['real*8', 'integer', 'double precision', 'logical'] - function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ + function_pat = re.compile(r'^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ % {'type':'|'.join(f77_type)}, re.I+re.M) include_pat = re.compile(r"\s+include\s+[\'\"]([\w\./]*)") @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - 
logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/check_param_card.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/common_run_interface.py index 9ff7390cf5..8de498fcc2 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. 
Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. (beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. 
Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" @@ -6868,8 +6890,9 @@ def handle_alarm(signum, frame): else: log_level=20 - - if run_card: + if run_card and (run_card['lpp1'] !=0 or run_card['lpp2'] !=0): + # They are likely case like lpp=+-3, where alpas not need reset + # but those have dedicated name of pdf avoid the reset as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, 'cteq6_l': 0.118, diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/file_writers.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/files.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/files.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, 
error=False): if error: raise if log: - logger.warning(why) + logger.warning("fail to cp", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "
To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "
To save bandwidth not all diagrams were converted to PNG."; print PAGE "
To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 
'nprocs' in opts: + self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/histograms.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/histograms.py index 51ae2914fc..0883cd9613 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale 
for the variation. + if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2405,10 +2404,10 @@ def output(self, path, format='gnuplot',number_of_ratios = -1, gnuplot_output_list=gnuplot_output_list_v5 else: output, _ = p.communicate() - output.decode(errors='ignore') + output = output.decode(errors='ignore') if not output: gnuplot_output_list=gnuplot_output_list_v5 - elif float(output.split()[1]) < 5. : + elif int(output.split()[1].split('.')[0]) < 5 : gnuplot_output_list=gnuplot_output_list_v4 else: gnuplot_output_list=gnuplot_output_list_v5 @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. 
Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. 
import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/lhe_parser.py index f6e47956cd..d4b94bab10 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -1792,7 +1794,10 @@ def add_decays(self, pdg_to_decay): if particle.pdg in pdg_to_decay and pdg_to_decay[particle.pdg]: one_decay = pdg_to_decay[particle.pdg].pop() self.add_decay_to_particle(i, one_decay) + particle.helicity = 9 return 
self.add_decays(pdg_to_decay) + + return self @@ -2166,10 +2171,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2961,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..dea35930ea 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz 
%(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3667,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_comine_iteration(self, line): + def do_combine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
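The pseudorapidity fix in lhe_parser.py above swaps numerator and denominator: 0.5*ln((|p|-pz)/(|p|+pz)) is minus the pseudorapidity, whereas the corrected 0.5*ln((|p|+pz)/(|p|-pz)) agrees with the usual definition eta = -ln(tan(theta/2)). A short numerical check of that identity (standalone sketch, not code from the patch; the momentum components are arbitrary):

import math

def pseudorapidity(px, py, pz):
    # eta = 0.5 * ln((|p| + pz) / (|p| - pz)), as in the corrected property above
    norm = math.sqrt(px**2 + py**2 + pz**2)
    return 0.5 * math.log((norm + pz) / (norm - pz))

def pseudorapidity_from_angle(px, py, pz):
    # equivalent definition via the polar angle: eta = -ln(tan(theta / 2))
    theta = math.atan2(math.hypot(px, py), pz)
    return -math.log(math.tan(theta / 2.0))

px, py, pz = 10.0, -5.0, 40.0   # illustrative momentum components (GeV)
assert abs(pseudorapidity(px, py, pz) - pseudorapidity_from_angle(px, py, pz)) < 1e-12

# A particle moving forward (pz > 0) must have positive eta; the pre-fix
# expression would have returned the negative of this value.
assert pseudorapidity(px, py, pz) > 0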
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
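The do_pythia8 changes above only use the legacy MG5aMC_PY8_interface when --old_interface is passed; otherwise they look for Pythia8's own main164 driver and fall back to the old interface if it cannot be found. A schematic sketch of that selection order (the function name and argument names are placeholders; the real logic lives inside madevent_interface.py):

import os

def resolve_pythia8_driver(args, pythia8_path, mg5amc_py8_interface_path=None):
    """Pick the Pythia8 executable with the same precedence as the patch:
    --old_interface forces the MG5aMC_PY8_interface tool, otherwise prefer the
    main164 driver shipped with Pythia8 and fall back to the old tool."""
    use_old_interface = '--old_interface' in args
    if use_old_interface:
        args.remove('--old_interface')
        if not mg5amc_py8_interface_path:
            raise RuntimeError('MG5aMC_PY8_interface not available')
        return os.path.join(mg5amc_py8_interface_path, 'MG5aMC_PY8_interface')

    # Newer Pythia8 installations ship main164 in one of these two locations.
    candidates = [
        os.path.join(pythia8_path, 'share', 'Pythia8', 'examples', 'main164'),
        os.path.join(pythia8_path, 'examples', 'main164'),
    ]
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate

    # main164 not found (or not compiled): retry with the old interface.
    return resolve_pythia8_driver(args + ['--old_interface'],
                                  pythia8_path, mg5amc_py8_interface_path)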
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
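The wrapper scripts above are built with two rounds of %-formatting: the first pass only decides whether the new interface's '-c' command-line flag is inserted, so the shell and executable placeholders are written as %%s to survive until the second pass (exe_cmd % (shell_exe, basename)). A small standalone illustration of that escaping pattern (shell_exe and the executable name are dummy values):

# First pass: fix the optional '-c' flag; each '%%s' becomes '%s' for pass two.
use_mg5amc_py8_interface = False
template = """#!%%s
./%%s %s PY8Card.dat >& PY8_log.txt
""" % ('' if use_mg5amc_py8_interface else '-c')

# Second pass: fill in the interpreter and the executable name.
shell_exe = '/bin/bash'          # dummy value for the sketch
pythia_main = 'main164'          # dummy value for the sketch
exe_cmd = template % (shell_exe, pythia_main)
print(exe_cmd)
# #!/bin/bash
# ./main164 -c PY8Card.dat >& PY8_log.txt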
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -5596,6 +5635,19 @@ def do_plot(self, line): else: logger.info('No valid files for delphes plot') + def do_compile(self, line): + """compile the current directory """ + + args = self.split_arg(line) + self.ask_run_configuration(mode='parton') + self.run_card = banner_mod.RunCard(pjoin(self.me_dir, 'Cards', 'run_card.dat')) + self.configure_directory(html_opening =False) + + for Pdir in self.get_Pdir(): + misc.sprint(Pdir) + self.compile(['gensym'], cwd=Pdir) + self.compile(['madevent_forhel'], cwd=Pdir) + ############################################################################ def do_syscalc(self, line): """Evaluate systematics variation weights for a given run""" @@ -6132,7 +6184,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in 
Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file.' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6896,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6906,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7023,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... 
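The new remove_empty_events helper above drops channel (G*) directories whose events.lhe is missing or essentially empty, and classifies the cause by searching the channel log for known markers. A simplified sketch of that classification idea (not the method itself: the real implementation reads the log backwards via misc.BackRead, falls back to run1_app.log, and has extra handling for the "Deleting file events.lhe" marker and for unmatched logs):

import collections
import os

def classify_empty_channels(gdirs, markers=None):
    """Keep directories with a non-trivial events.lhe; bucket the rest by the
    reason found in their log file (simplified sketch of the loop above)."""
    markers = markers or {
        'Impossible BW configuration': 'bwconfig',
        'Loosen cuts or increase max_events': 'cuts',
        'all returned zero': 'zero',
    }
    kept, reasons = [], collections.defaultdict(list)
    for gdir in gdirs:
        events = os.path.join(gdir, 'events.lhe')
        if os.path.exists(events) and os.path.getsize(events) >= 10:
            kept.append(gdir)
            continue
        reason = 'unknown'
        log_path = os.path.join(gdir, 'log.txt')
        if os.path.exists(log_path):
            with open(log_path) as log:
                for line in log:
                    for marker, tag in markers.items():
                        if marker in line:
                            reason = tag   # last match wins, roughly mimicking a reverse scan
        reasons[reason].append(gdir)
    return kept, reasons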
self.exec_cmd('combine_events') @@ -6902,6 +7051,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7066,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7078,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7203,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7223,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/restore_data b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/restore_data +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . 
-mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/sum_html.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s

' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/madevent b/epochX/cudacpp/nobm_pp_ttW.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/madevent +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/src/HelAmps_sm_no_b_mass.h b/epochX/cudacpp/nobm_pp_ttW.mad/src/HelAmps_sm_no_b_mass.h index 850b86e0e6..9d6ce139ee 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/src/HelAmps_sm_no_b_mass.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/src/HelAmps_sm_no_b_mass.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/src/Parameters_sm_no_b_mass.cc b/epochX/cudacpp/nobm_pp_ttW.mad/src/Parameters_sm_no_b_mass.cc index d799b19eeb..cbce3f44c5 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/src/Parameters_sm_no_b_mass.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/src/Parameters_sm_no_b_mass.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/src/Parameters_sm_no_b_mass.h b/epochX/cudacpp/nobm_pp_ttW.mad/src/Parameters_sm_no_b_mass.h index e448052141..0fbfb533e9 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/src/Parameters_sm_no_b_mass.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/src/Parameters_sm_no_b_mass.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuConfig.h index 7c6a082392..be5c5a6357 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/test/cudacpp_test.mk b/epochX/cudacpp/nobm_pp_ttW.mad/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/nobm_pp_ttW.mad/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt index 0a0d056033..a93dec7f6c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt +++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +56,7 @@ set zerowidth_tchannel F define j = p INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0064830780029296875  +DEBUG: model prefixing takes 0.005368709564208984  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -168,7 +167,7 @@ INFO: Process u~ u > t t~ added to mirror process u u~ > t t~ INFO: Process c~ c > t t~ added to mirror process c c~ > t t~ INFO: Process d~ d > t t~ added to mirror process d d~ > t t~ INFO: Process s~ s > t t~ added to mirror process s s~ > t t~ -5 processes with 7 diagrams generated in 0.030 s +5 processes with 7 diagrams generated in 0.032 s Total: 5 processes with 7 diagrams add process p p > t t~ j @1 INFO: Checking for minimal orders which gives processes. 
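Both the restore_data script and the googletest makefile above special-case macOS (no nproc, different compiler flags). For Python-side tooling the same "how many cores are available" question can be answered portably; a minimal sketch of the equivalent detection (illustrative only, the scripts in the patch remain in shell and Make):

import multiprocessing
import platform
import subprocess

def available_cores():
    """Portable core count: standard library first, shell tools as fallback."""
    try:
        return multiprocessing.cpu_count()
    except NotImplementedError:
        pass
    if platform.system() == 'Darwin':
        # macOS has no nproc; sysctl reports the logical CPU count instead.
        out = subprocess.run(['sysctl', '-n', 'hw.ncpu'],
                             capture_output=True, text=True, check=True)
    else:
        out = subprocess.run(['nproc', '--all'],
                             capture_output=True, text=True, check=True)
    return int(out.stdout.strip())

print(available_cores())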
@@ -374,21 +373,21 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~ INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. -65 processes with 1119 diagrams generated in 1.933 s +65 processes with 1119 diagrams generated in 1.876 s Total: 83 processes with 1202 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_pp_tt012j INFO: remove old information in CODEGEN_mad_pp_tt012j -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Processing color information for process: g g > t t~ g g @2 @@ -499,9 +498,9 @@ FileWriter t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 
103: 111, 104: 112, 105: 113} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1577]  INFO: Creating files in directory P2_gg_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
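The DEBUG lines above print iconfig_to_diag together with its inverse diag_to_iconfig; the second map is simply the first with keys and values swapped, which is only safe because the mapping is one-to-one. A one-line sketch of that inversion, using a small excerpt of the gg_ttxgg mapping shown in the log:

# Small excerpt of the mapping printed above (iconfig -> diagram number).
iconfig_to_diag = {1: 2, 2: 3, 3: 4, 30: 31, 31: 33}

# Invert it; valid only because no two iconfigs point to the same diagram.
diag_to_iconfig = {diag: iconfig for iconfig, diag in iconfig_to_diag.items()}

assert diag_to_iconfig[33] == 31
assert all(iconfig_to_diag[i] == d for d, i in diag_to_iconfig.items())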
@@ -510,9 +509,9 @@ FileWriter t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1577]  INFO: Creating files in directory P2_gu_ttxgu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -521,9 +520,9 @@ FileWriter t t~ g u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1577]  INFO: Creating files in directory P2_gux_ttxgux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -532,9 +531,9 @@ FileWriter t t~ g u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1577]  INFO: Creating files in directory P2_uux_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -543,9 +542,9 @@ FileWriter t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1577]  INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -554,9 +553,9 @@ FileWriter t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1577]  INFO: Creating files in directory P2_uu_ttxuu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -565,9 +564,9 @@ FileWriter t t~ u u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu -DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1577]  INFO: Creating files in directory P2_uux_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -576,9 +575,9 @@ FileWriter t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux -DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1577]  INFO: Creating files in directory P2_uxux_ttxuxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -587,9 +586,9 @@ FileWriter t t~ u~ u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux -DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1577]  INFO: Creating files in directory P2_uc_ttxuc DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -598,9 +597,9 @@ FileWriter t t~ u c WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1577]  INFO: Creating files in directory P2_uux_ttxccx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -609,9 +608,9 @@ FileWriter t t~ c c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1577]  INFO: Creating files in directory P2_ucx_ttxucx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -620,9 +619,9 @@ FileWriter t t~ u c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1577]  INFO: Creating files in directory P2_uxcx_ttxuxcx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -631,9 +630,9 @@ FileWriter t t~ u~ c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1577]  INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -642,9 +641,9 @@ FileWriter t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1577]  INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -653,9 +652,9 @@ FileWriter t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1577]  INFO: Creating files in directory P1_uux_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -664,9 +663,9 @@ FileWriter t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group uux_ttxg -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1577]  INFO: Creating files in directory P0_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -675,9 +674,9 @@ FileWriter t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1577]  INFO: Creating files in directory P0_uux_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -686,25 +685,25 @@ FileWriter t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group uux_ttx -DEBUG: len(subproc_diagrams_for_config) =  1 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1} [model_handling.py at line 1552]  -Generated helas calls for 18 subprocesses (372 diagrams) in 1.286 s -Wrote files for 810 helas calls in 2.762 s +DEBUG: len(subproc_diagrams_for_config) =  1 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1} [model_handling.py at line 1577]  +Generated helas calls for 18 subprocesses (372 diagrams) in 1.276 s +Wrote files for 810 helas calls in 3.216 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.340 s +ALOHA: aloha creates 5 routines in 0.337 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.315 s +ALOHA: aloha creates 10 routines in 0.332 s VVV1 VVV1 FFV1 @@ -717,120 +716,32 @@ ALOHA: aloha creates 10 routines in 0.315 s VVVV3 VVVV4 VVVV4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P0_gg_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P0_uux_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 74 (offset 3 lines). -Hunk #2 succeeded at 230 (offset 3 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P1_gg_ttxg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #2 succeeded at 243 (offset 16 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P1_gu_ttxu; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 74 (offset 3 lines). -Hunk #2 succeeded at 246 (offset 19 lines). 
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P1_gux_ttxux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 74 (offset 3 lines). -Hunk #2 succeeded at 246 (offset 19 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P1_uux_ttxg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 74 (offset 3 lines). -Hunk #2 succeeded at 246 (offset 19 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_gg_ttxgg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #2 succeeded at 275 (offset 48 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_gg_ttxuux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 74 (offset 3 lines). -Hunk #2 succeeded at 278 (offset 51 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_gu_ttxgu; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 74 (offset 3 lines). -Hunk #2 succeeded at 278 (offset 51 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_gux_ttxgux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 74 (offset 3 lines). -Hunk #2 succeeded at 278 (offset 51 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uc_ttxuc; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 76 (offset 5 lines). -Hunk #2 succeeded at 280 (offset 53 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_ucx_ttxucx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 82 (offset 11 lines). -Hunk #2 succeeded at 286 (offset 59 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uu_ttxuu; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 74 (offset 3 lines). -Hunk #2 succeeded at 278 (offset 51 lines). 
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uux_ttxccx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 82 (offset 11 lines). -Hunk #2 succeeded at 286 (offset 59 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uux_ttxgg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 74 (offset 3 lines). -Hunk #2 succeeded at 278 (offset 51 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uux_ttxuux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 74 (offset 3 lines). -Hunk #2 succeeded at 278 (offset 51 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uxcx_ttxuxcx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 76 (offset 5 lines). -Hunk #2 succeeded at 280 (offset 53 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uxux_ttxuxux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 74 (offset 3 lines). -Hunk #2 succeeded at 278 (offset 51 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done. +DEBUG: result.returncode =  0 [output.py at line 273]  +Output to directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README +/home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README Run "open index.html" to see more information about this process. quit -real 0m11.258s -user 0m9.633s -sys 0m0.984s -Code generation completed in 12 seconds +real 0m13.682s +user 0m11.871s +sys 0m1.595s +Code generation completed in 14 seconds ************************************************************ * * * W E L C O M E to * @@ -843,7 +754,7 @@ Code generation completed in 12 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -851,10 +762,9 @@ Code generation completed in 12 seconds * Type 'help' for in-line help. 
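The DEBUG lines above show each per-subprocess patch being applied with 'patch -pN -i <patchfile>' and the final 'result.returncode = 0' check; the 'offset N lines' messages simply mean a hunk matched at a shifted position. A hypothetical standalone sketch (not code from output.py) of running such a command and verifying its exit status:

#include <cstdlib>
#include <stdexcept>
#include <string>

// Hypothetical helper: apply a patch file inside a directory and fail loudly
// if the patch command reports an error (exit status 0 means success).
void applyPatch( const std::string& dir, const std::string& patchFile, int striplevel )
{
  const std::string cmd = "cd " + dir + " && patch -p" + std::to_string( striplevel ) + " -i " + patchFile;
  const int status = std::system( cmd.c_str() );
  if( status != 0 ) throw std::runtime_error( "patch failed (status " + std::to_string( status ) + "): " + cmd );
}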
* * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -873,7 +783,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -881,10 +791,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/pp_tt012j.mad/COPYRIGHT b/epochX/cudacpp/pp_tt012j.mad/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/pp_tt012j.mad/COPYRIGHT +++ b/epochX/cudacpp/pp_tt012j.mad/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. 
The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt b/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt index 68b4c46295..311ceaa803 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat index 33311e49bc..92b8989f46 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.5 2025-10-17 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/run_card.dat b/epochX/cudacpp/pp_tt012j.mad/Cards/run_card.dat index 5eb60f35df..fe9c38d826 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/run_card.dat +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/run_card.dat @@ -125,6 +125,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/run_card_default.dat b/epochX/cudacpp/pp_tt012j.mad/Cards/run_card_default.dat index 38810a6b83..0185201786 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/run_card_default.dat @@ -125,6 +125,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/pp_tt012j.mad/MGMEVersion.txt b/epochX/cudacpp/pp_tt012j.mad/MGMEVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/MGMEVersion.txt +++ b/epochX/cudacpp/pp_tt012j.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/Source/.make_opts b/epochX/cudacpp/pp_tt012j.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Source/.make_opts +++ b/epochX/cudacpp/pp_tt012j.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/pp_tt012j.mad/Source/alfas_functions.f b/epochX/cudacpp/pp_tt012j.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/pp_tt012j.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/pp_tt012j.mad/Source/cuts.inc b/epochX/cudacpp/pp_tt012j.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Source/cuts.inc +++ b/epochX/cudacpp/pp_tt012j.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/pp_tt012j.mad/Source/make_opts b/epochX/cudacpp/pp_tt012j.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Source/make_opts +++ b/epochX/cudacpp/pp_tt012j.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/pp_tt012j.mad/Source/makefile b/epochX/cudacpp/pp_tt012j.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Source/makefile +++ b/epochX/cudacpp/pp_tt012j.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/pp_tt012j.mad/Source/run_card.inc b/epochX/cudacpp/pp_tt012j.mad/Source/run_card.inc index 2588190439..e169c1f193 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Source/run_card.inc +++ b/epochX/cudacpp/pp_tt012j.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. 
Teig, A. Valassi, Z. Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. 
In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). + */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
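The constructor logic shown above derives the GPU grid from the number of events: it starts from 256 threads per block and keeps halving until gpublocks*gputhreads equals nevt exactly, throwing if it would have to drop below the 32-thread minimum. A standalone sketch of that selection loop (hypothetical helper name; the real code updates the m_gpublocks/m_gputhreads members in place):

#include <stdexcept>
#include <string>
#include <utility>

std::pair<int, int> chooseGpuGrid( int nevt, int gputhreadsmin = 32 )
{
  if( nevt < gputhreadsmin || nevt % gputhreadsmin != 0 )
    throw std::runtime_error( "nevt should be a multiple of " + std::to_string( gputhreadsmin ) );
  int gputhreads = 256;              // default number of gpu threads
  int gpublocks = nevt / gputhreads; // may initially be too small (or zero)
  while( nevt != gpublocks * gputhreads )
  {
    gputhreads /= 2; // halve the threads per block until the grid matches nevt exactly
    if( gputhreads < gputhreadsmin )
      throw std::logic_error( "cannot choose gputhreads" ); // unreachable after the checks above
    gpublocks = nevt / gputhreads;
  }
  return { gpublocks, gputhreads };
}

For instance, nevt=96 ends up as 3 blocks of 32 threads, while nevt=512 keeps the default of 256 threads with 2 blocks.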
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MGVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
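The checkGpuBlas/assertGpuBlas helper added to GpuRuntime.h above plays the same role for gpuBlas status codes that checkGpu plays for gpuError_t. A short illustrative usage sketch (assuming the GpuAbstraction.h and GpuRuntime.h headers from this diff and a CUDA or HIP build with BLAS enabled; not code from the plugin itself):

#include "GpuAbstraction.h"
#include "GpuRuntime.h"

#ifndef MGONGPU_HAS_NO_BLAS
void blasHandleOnStreamExample()
{
  gpuStream_t stream;
  gpuStreamCreate( &stream ); // the macro already wraps the call in checkGpu
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) );           // abort unless the status is GPUBLAS_STATUS_SUCCESS
  checkGpuBlas( gpuBlasSetStream( handle, stream ) ); // run subsequent BLAS calls on this stream
  checkGpuBlas( gpuBlasDestroy( handle ) );
  gpuStreamDestroy( stream );
}
#endif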
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
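The rewritten MatrixElementKernelDevice::computeMatrixElements above no longer launches a single sigmaKin kernel: the host-side sigmaKin now fans work out over one GPU stream per good helicity (m_helStreams) and synchronizes once before helicity/color selection. A standalone sketch of that pattern in plain CUDA (kernel and buffer names are placeholders, not the plugin's):

```cpp
// stream_fanout_sketch.cu -- illustrative only: launch one kernel per good helicity
// on its own stream so the launches may overlap, then synchronize once before any
// cross-helicity step (reduction, helicity/color selection).
#include <cstdio>
#include <cuda_runtime.h>

__global__ void perHelicityKernel( float* out, int ihel, int nevt ) // stand-in for calculate_jamps
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  if( ievt < nevt ) out[ihel * nevt + ievt] = (float)ihel;
}

int main()
{
  const int nGoodHel = 4, blocks = 2, threads = 32, nevt = blocks * threads;
  float* d_out = nullptr;
  cudaMalloc( &d_out, nGoodHel * nevt * sizeof( float ) );
  cudaStream_t streams[nGoodHel];
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) cudaStreamCreate( &streams[ighel] );
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
    perHelicityKernel<<<blocks, threads, 0, streams[ighel]>>>( d_out, ighel, nevt ); // may overlap
  cudaDeviceSynchronize(); // close the fan-out before the cross-helicity reduction/selection
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) cudaStreamDestroy( streams[ighel] );
  cudaFree( d_out );
  printf( "launched %d per-helicity kernels on %d streams\n", nGoodHel, nGoodHel );
  return 0;
}
```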
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h index 65a101888d..2fa0ce29e0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 
+201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef 
DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc index a17c5f1eef..1d67401043 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
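The new DeviceAccessJamp2 helper above (and the DeviceAccessJamp accesses used later for the jamp super-buffers) addresses per-color, per-event data as buffer[icol * nevt + ievt], i.e. a structure-of-arrays layout in which consecutive threads (consecutive ievt) touch consecutive addresses. A standalone illustration of that indexing, with placeholder names:

```cpp
// soa_layout_sketch.cu -- illustrative only: the [icol * nevt + ievt] layout stores
// all events of a given color contiguously, so the threads of a warp (consecutive
// ievt) read and write consecutive addresses (coalesced accesses).
#include <cstdio>
#include <cuda_runtime.h>

__global__ void fillSoA( float* buf, int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  for( int icol = 0; icol < ncolor; icol++ )
    buf[icol * nevt + ievt] = 100.f * icol + ievt; // coalesced store for each icol
}

int main()
{
  const int ncolor = 2, blocks = 2, threads = 32, nevt = blocks * threads;
  float* d_buf = nullptr;
  cudaMalloc( &d_buf, ncolor * nevt * sizeof( float ) );
  fillSoA<<<blocks, threads>>>( d_buf, ncolor );
  cudaDeviceSynchronize();
  float h_buf[ncolor * nevt];
  cudaMemcpy( h_buf, d_buf, sizeof( h_buf ), cudaMemcpyDeviceToHost );
  printf( "buf[icol=1][ievt=3] = %g (expect 103)\n", h_buf[1 * nevt + 3] );
  cudaFree( d_buf );
  return 0;
}
```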
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -368,154 +424,43 @@ namespace mg5amcCpu jamp_sv[1] -= amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_gg_ttx()?) 
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2 }, - { -2, 16 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
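The new jamp2 accumulation above notes that atomicAdd is required once each good helicity runs in its own stream: kernels for different helicities may update the same per-event jamp2 entry concurrently, and a plain += would be a data race. A standalone illustration (placeholder names, not plugin code):

```cpp
// atomic_accum_sketch.cu -- illustrative only: several kernels launched on different
// streams accumulate into the same per-event buffer; atomicAdd keeps the sum well
// defined, whereas an unguarded read-modify-write could lose contributions.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void accumulate( float* accum, float contribution, int nevt )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  if( ievt < nevt ) atomicAdd( &accum[ievt], contribution ); // safe across concurrent streams
}

int main()
{
  const int nstreams = 4, blocks = 2, threads = 32, nevt = blocks * threads;
  float* d_accum = nullptr;
  cudaMalloc( &d_accum, nevt * sizeof( float ) );
  cudaMemset( d_accum, 0, nevt * sizeof( float ) );
  cudaStream_t streams[nstreams];
  for( int i = 0; i < nstreams; i++ ) cudaStreamCreate( &streams[i] );
  for( int i = 0; i < nstreams; i++ )
    accumulate<<<blocks, threads, 0, streams[i]>>>( d_accum, 1.f, nevt ); // concurrent updates
  cudaDeviceSynchronize();
  float h0 = 0;
  cudaMemcpy( &h0, d_accum, sizeof( float ), cudaMemcpyDeviceToHost );
  printf( "accum[0] = %g (expect %d)\n", h0, nstreams );
  for( int i = 0; i < nstreams; i++ ) cudaStreamDestroy( streams[i] );
  cudaFree( d_accum );
  return 0;
}
```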
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -555,7 +500,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -588,6 +537,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -628,6 +581,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -748,8 +705,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -757,25 +714,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -920,13 +1055,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -938,18 +1067,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -974,93 +1108,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1102,7 +1173,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1125,7 +1196,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1134,21 +1205,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1162,8 +1235,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1179,11 +1254,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1285,14 +1361,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h index 2d89e0e244..c4a9fe53db 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 3; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum 
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f index 19278bca59..f2058f757e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f index 42cc7c9d61..325bd60fb1 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -442,51 +446,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. 
- IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/color_sum.cc new file mode 100644 index 0000000000..b68b9250fd --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/color_sum.cc @@ -0,0 +1,427 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2 }, + { -2, 16 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity +
const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given 
event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! 
From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need 
one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same<fptype, fptype2>::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/configs.inc index 99d3eecc56..0dbac30825 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/configs.inc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/configs.inc @@ 
-24,3 +24,5 @@ C Diagram 3 DATA (SPROP(I,-2,3),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/3/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/driver.f index ec5722702a..30cca27587 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f index ca1785b808..a2d45dc02c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -71,10 +71,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! 
-1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -227,17 +224,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -307,7 +293,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -350,7 +336,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -393,21 +380,24 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 2) /5.333333333333333D+00, - $ -6.666666666666666D-01/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 2) /16,-4/ C 1 T(1,2,3,4) - DATA (CF(I, 2),I= 1, 2) /-6.666666666666666D-01 - $ ,5.333333333333333D+00/ + DATA (CF(I),I= 3, 3) /16/ C 1 T(2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -446,10 +436,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -458,6 +450,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc index 0979455d7a..24c9be9271 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,9 +101,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -109,10 +112,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,43 +171,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + 
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! 
in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -219,7 +275,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one 
event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -228,14 +283,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... @@ -261,14 +319,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -292,7 +346,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -306,7 +359,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -317,6 +369,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -345,154 +401,43 @@ namespace mg5amcCpu jamp_sv[1] -= 1. / 2. 
* amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_uux_ttx()?) - - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 9, 3 }, - { 3, 9 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
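// A minimal self-contained CUDA sketch of the accumulation pattern noted in the hunk above: with one
// calculate_jamps kernel launched per good helicity in its own stream, several kernels may update the
// same colAllJamp2s[icol][ievt] element concurrently, so the per-color running sum of |jamp|^2 over
// helicities must use atomicAdd rather than a plain "+=". Assumptions: plain double precision, no
// memory-access helpers, and the kernel name accumulateJamp2 is illustrative only (the plugin does
// this inside calculate_jamps through its DeviceAccessJamp2 accessor).
#include <cuda_runtime.h>

__global__ void
accumulateJamp2( double* colAllJamp2s,  // in/out: [ncolor][nevt] running sum over colors and helicities
                 const double* jampRe,  // input: [ncolor][nevt] real parts of the jamps for this helicity
                 const double* jampIm,  // input: [ncolor][nevt] imaginary parts of the jamps for this helicity
                 const int ncolor,
                 const int nevt )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per GPU thread
  if( ievt >= nevt ) return;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    const double re = jampRe[icol * nevt + ievt];
    const double im = jampIm[icol * nevt + ievt];
    // Kernels for other helicities (other streams) may be adding to the same element at the same time,
    // hence the atomic update (atomicAdd on double requires compute capability >= 6.0).
    atomicAdd( &colAllJamp2s[icol * nevt + ievt], re * re + im * im );
  }
}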
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -532,7 +477,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -565,6 +514,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -605,6 +558,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -725,8 +682,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -734,25 +691,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -897,13 +1032,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -915,18 +1044,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -951,93 +1085,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1079,7 +1150,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1102,7 +1173,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1111,21 +1182,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1139,8 +1212,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1156,11 +1231,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1262,14 +1338,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h index d6fa3205c0..b2f1c18fba 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,6 +79,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 1; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -125,7 +127,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -133,9 +135,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -155,34 +159,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum 
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f index 6558c40922..cfdb6645ac 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f index 86f844defe..779ad4cdc1 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,7 +140,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) @@ -150,7 +150,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -237,7 +237,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -313,6 +313,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -398,24 +402,24 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -497,51 +501,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/color_sum.cc new file mode 100644 index 0000000000..04c22fd369 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/color_sum.cc @@ -0,0 +1,427 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
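// A small plain-C++ indexing sketch of the allJamps super-buffer layout consumed by the new
// color_sum.cc that follows. Assumptions: the buffer shape is [2][ncolor][nGoodHel][nevt]
// (real/imaginary slice first, then color, then good helicity, then event), as documented for the
// ghelAllJamps argument of sigmaKin; the helper names jampIndex and readJamp are illustrative only
// (the plugin reads this buffer through its DeviceAccessJamp memory accessor instead).
#include <complex>
#include <cstddef>

// Flat offset of the real (reim=0) or imaginary (reim=1) part of jamp(icol) for helicity ihel and event ievt
inline std::size_t
jampIndex( int reim, int icol, int ihel, int ievt, int ncolor, int nGoodHel, int nevt )
{
  return ( ( static_cast<std::size_t>( reim ) * ncolor + icol ) * nGoodHel + ihel ) * nevt + ievt;
}

// Reassemble one complex partial amplitude from the two real slices of the super-buffer
inline std::complex<double>
readJamp( const double* allJamps, int icol, int ihel, int ievt, int ncolor, int nGoodHel, int nevt )
{
  return { allJamps[jampIndex( 0, icol, ihel, ievt, ncolor, nGoodHel, nevt )],
           allJamps[jampIndex( 1, icol, ihel, ievt, ncolor, nGoodHel, nevt )] };
}
// This striding keeps each real or imaginary slice contiguous as an (nGoodHel*nevt) x ncolor matrix,
// which is what allows the BLAS-based color sum below to operate on it directly.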
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 9, 3 }, + { 3, 9 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/configs.inc index a99b3c9fba..ef48c8df8d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/configs.inc +++ 
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/configs.inc @@ -6,3 +6,5 @@ C Diagram 1 DATA TPRID(-1,1)/0/ C Number of configs DATA MAPCONFIG(0)/1/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/driver.f index ec5722702a..30cca27587 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f index ec88a303fa..34923f2e60 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -74,10 +74,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -230,17 +227,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -310,7 +296,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -356,7 +342,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -399,21 +386,24 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 2) /9.000000000000000D+00 - $ ,3.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 2) /9,6/ C 1 T(2,1) T(3,4) - DATA (CF(I, 2),I= 1, 2) /3.000000000000000D+00 - $ ,9.000000000000000D+00/ + DATA (CF(I),I= 3, 3) /9/ C 1 T(2,4) T(3,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -444,10 +434,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -456,6 +448,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index 5de1c626c8..037b031386 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. 
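The matrix1.f hunk above replaces the dense REAL*8 CF(NCOLOR,NCOLOR) color matrix with a packed integer upper triangle plus a common DENOM, so the color sum visits each off-diagonal pair only once. Below is a minimal standalone C++ sketch of that packed-triangular sum; it is not part of the patch, and the names packedColorSum, cfPacked and denom are illustrative only.

    // Sketch of the packed-triangular color sum implemented by the regenerated matrix1.f:
    // the symmetric color matrix is stored as a 1-D integer array over its upper triangle,
    // with off-diagonal entries pre-doubled, and a common integer denominator divided out once.
    #include <complex>
    #include <vector>

    double packedColorSum( const std::vector<int>& cfPacked,                  // ncolor*(ncolor+1)/2 entries
                           const std::vector<std::complex<double>>& jamp,     // ncolor color flows
                           int denom )
    {
      const int ncolor = static_cast<int>( jamp.size() );
      double me2 = 0.;
      int cfIndex = 0;
      for( int i = 0; i < ncolor; i++ )
      {
        std::complex<double> ztemp = 0.;
        for( int j = i; j < ncolor; j++ ) ztemp += double( cfPacked[cfIndex++] ) * jamp[j]; // upper triangle only
        me2 += std::real( ztemp * std::conj( jamp[i] ) ); // taking the real part reproduces the full symmetric quadratic form
      }
      return me2 / denom;
    }

For the P0_uux_ttx case above this corresponds to cfPacked = {9, 6, 9} (the off-diagonal 3 pre-doubled to 6) and denom = 1.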
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
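For reference, the striding that the comments in calculate_jamps, convertD2F_Jamps and color_sum_blas describe for the jamp super-buffer jamp[2][ncolor][nhel][nevt] can be summarised by the following flat-index helper. This is an illustrative sketch only (jampFlatIndex is not a function in the patch): real/imaginary part varies slowest, then color, then helicity, then event, so consecutive GPU threads of one helicity stream touch consecutive memory locations.

    // Hypothetical helper mirroring the DeviceAccessJamp striding documented in this patch
    inline int jampFlatIndex( int ix2,    // 0 = real part, 1 = imaginary part
                              int icol,   // color flow index, 0 <= icol < ncolor
                              int ihel,   // good-helicity index, 0 <= ihel < nhel
                              int ievt,   // event index, 0 <= ievt < nevt
                              int ncolor,
                              int nhel,
                              int nevt )
    {
      return ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt;
    }

With this layout the per-helicity sub-buffer handed to one CUDA stream is obtained simply as ghelAllJamps + ighel * nevt, which matches the pointer arithmetic used elsewhere in this patch (the per-helicity kernels then access it with ihel = 0 and nhel = nGoodHel).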
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -567,158 +623,43 @@ namespace mg5amcCpu jamp_sv[5] -= amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttxg()?) 
- - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 9, 9, 9, 9, 9, 9 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 64, -8, -8, 1, 1, 10 }, - { -8, 64, 1, 10, -8, 1 }, - { -8, 1, 64, -8, 10, 1 }, - { 1, 10, -8, 64, 1, -8 }, - { 1, -8, 10, 1, 64, -8 }, - { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
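The removed comment block above summarises the two algebraic properties that survive in the new color_sum code: because the color matrix M is real, the quadratic form J†MJ reduces to A·M·A + B·M·B with A = Re(J) and B = Im(J); and because M is symmetric, only the upper triangle needs to be visited once the factor 2 and the 1/denom normalisation are folded in at compile time. A minimal standalone C++ sketch follows; it is illustrative only and, for brevity, reuses the 2-colour values from the P0_uux_ttx matrix1.f hunk earlier in this patch rather than the 6-colour gg→ttxg matrix removed here.

    // Sketch: compile-time normalized triangular color matrix and the A.M.A + B.M.B color sum
    #include <array>

    constexpr int NCOL = 2;
    constexpr double CF[NCOL][NCOL] = { { 9, 3 }, { 3, 9 } };
    constexpr double DENOM[NCOL] = { 1, 1 };

    struct TriangularNormalized
    {
      constexpr TriangularNormalized() : value()
      {
        for( int i = 0; i < NCOL; i++ )
        {
          value[i][i] = CF[i][i] / DENOM[i];                                        // diagonal term
          for( int j = i + 1; j < NCOL; j++ ) value[i][j] = 2 * CF[i][j] / DENOM[i]; // off-diagonal, pre-doubled
        }
      }
      double value[NCOL][NCOL];
    };

    inline double colorSum( const std::array<double, NCOL>& jampR, const std::array<double, NCOL>& jampI )
    {
      constexpr auto cf2 = TriangularNormalized();
      double me2 = 0;
      for( int i = 0; i < NCOL; i++ )
      {
        double ztempR = cf2.value[i][i] * jampR[i];
        double ztempI = cf2.value[i][i] * jampI[i];
        for( int j = i + 1; j < NCOL; j++ )
        {
          ztempR += cf2.value[i][j] * jampR[j];
          ztempI += cf2.value[i][j] * jampI[j];
        }
        me2 += jampR[i] * ztempR + jampI[i] * ztempI; // A.M.A + B.M.B contribution for row i
      }
      return me2;
    }

For any complex jamp vector, colorSum( Re(jamp), Im(jamp) ) reproduces the full quadratic form Σij Ji* Mij Jj exactly, the imaginary cross terms cancelling by symmetry.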
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -774,7 +715,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -808,6 +753,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -849,6 +798,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -969,8 +922,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -978,25 +931,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1141,13 +1272,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1159,18 +1284,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1195,93 +1325,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1323,7 +1390,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1346,7 +1413,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1355,21 +1422,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1383,8 +1452,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1400,11 +1471,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1506,14 +1578,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h index 2acfa000a7..69d8ea8b08 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 16; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running 
sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f index 10496aa04d..19937ed005 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f index 7c8695090c..9e5f9c9b0a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -442,51 +446,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. 
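
Before the Fortran changes continue below, it may help to see in isolation the GPU scheduling pattern that the new sigmaKin earlier in this patch relies on: one CUDA/HIP stream per good helicity, calculate_jamps launched on stream ighel for each good helicity, a per-helicity color sum, and a device-wide synchronize before the helicity/color selection kernels. The fragment below is a minimal standalone sketch of that launch-per-stream pattern written against the plain CUDA runtime API instead of the plugin's gpu* macros; the kernel, buffer and function names are hypothetical placeholders (not plugin code) and error checking is omitted.

```cpp
// Minimal sketch of the "one stream per good helicity" pattern (hypothetical names, compile with nvcc).
#include <cassert>
#include <cuda_runtime.h>

__global__ void toyJampKernel( int ihel, float* hJamps, int nevt ) // stand-in for calculate_jamps
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  if( ievt < nevt ) hJamps[ievt] = (float)ihel; // write something per event for this helicity
}

void launchPerHelicity( float* dAllJamps, const int* goodHel, int nGoodHel, int gpublocks, int gputhreads )
{
  const int nevt = gpublocks * gputhreads;
  assert( nGoodHel <= 32 ); // at most ncomb = 32 helicity combinations for this process
  cudaStream_t streams[32];
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) cudaStreamCreate( &streams[ighel] );
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
  {
    float* hJamps = dAllJamps + ighel * nevt; // per-helicity slice of the super-buffer
    toyJampKernel<<<gpublocks, gputhreads, 0, streams[ighel]>>>( goodHel[ighel], hJamps, nevt );
  }
  cudaDeviceSynchronize(); // as in sigmaKin: selection must wait until all helicity streams have completed
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) cudaStreamDestroy( streams[ighel] );
}
```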
- IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/color_sum.cc new file mode 100644 index 0000000000..9e3ce9d917 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/color_sum.cc @@ -0,0 +1,431 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 9, 9, 9, 9, 9, 9 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 64, -8, -8, 1, 1, 10 }, + { -8, 64, 1, 10, -8, 1 }, + { -8, 1, 64, -8, 10, 1 }, + { 1, 10, -8, 64, 1, -8 }, + { 1, -8, 10, 1, 64, -8 }, + { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for 
one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = 
E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x 
+ threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* 
ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/configs.inc index 1eb9c578f9..a3ad3e22cf 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/configs.inc +++ 
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/configs.inc @@ -171,3 +171,5 @@ C Diagram 15 DATA (SPROP(I,-3,15),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/15/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/driver.f index c2eadb2c31..aa93a3d195 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f index 797b19405d..48e24320cc 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -71,10 +71,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -243,17 +240,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -323,7 +309,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -366,7 +352,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(9) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -409,43 +396,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /7.111111111111111D+00, - $ -8.888888888888888D-01,-8.888888888888888D-01 - $ ,1.111111111111111D-01,1.111111111111111D-01,1.111111111111111D - $ +00/ + DATA DENOM/9/ + DATA (CF(I),I= 1, 6) /64,-16,-16,2,2,20/ C 1 T(1,2,5,3,4) - DATA (CF(I, 2),I= 1, 6) /-8.888888888888888D-01 - $ ,7.111111111111111D+00,1.111111111111111D-01,1.111111111111111D - $ +00,-8.888888888888888D-01,1.111111111111111D-01/ + DATA (CF(I),I= 7, 11) /64,2,20,-16,2/ C 1 T(1,5,2,3,4) - DATA (CF(I, 3),I= 1, 6) /-8.888888888888888D-01 - $ ,1.111111111111111D-01,7.111111111111111D+00, - $ -8.888888888888888D-01,1.111111111111111D+00,1.111111111111111D - $ -01/ + DATA (CF(I),I= 12, 15) /64,-16,20,2/ C 1 T(2,1,5,3,4) - DATA (CF(I, 4),I= 1, 6) /1.111111111111111D-01 - $ ,1.111111111111111D+00,-8.888888888888888D-01 - $ ,7.111111111111111D+00,1.111111111111111D-01, - $ -8.888888888888888D-01/ + DATA (CF(I),I= 16, 18) /64,2,-16/ C 1 T(2,5,1,3,4) - DATA (CF(I, 5),I= 1, 6) /1.111111111111111D-01, - $ -8.888888888888888D-01,1.111111111111111D+00,1.111111111111111D - $ -01,7.111111111111111D+00,-8.888888888888888D-01/ + DATA (CF(I),I= 19, 20) /64,-16/ C 1 T(5,1,2,3,4) - DATA (CF(I, 6),I= 1, 6) /1.111111111111111D+00 - $ ,1.111111111111111D-01,1.111111111111111D-01, - $ -8.888888888888888D-01,-8.888888888888888D-01 - $ ,7.111111111111111D+00/ + DATA (CF(I),I= 21, 21) /64/ C 1 T(5,2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. 
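
The matrix1.f change above replaces the dense REAL*8 CF(NCOLOR,NCOLOR) matrix with a packed integer upper triangle of length NCOLOR*(NCOLOR+1)/2: off-diagonal entries are pre-doubled, the common denominator is factored out into DENOM, and the inner loop walks J from I to NCOLOR with a single running CF_INDEX. The standalone C++ check below (not generated code; the jamp values are arbitrary test inputs) verifies that this packed triangular sum reproduces the full symmetric double sum for the gg->ttxg color matrix quoted in the DATA statements and in color_sum.cc.

```cpp
// Packed upper-triangular color matrix check (illustrative, not generated code).
#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  constexpr int ncolor = 6;
  const double denom = 9; // common denominator factored out as DENOM in matrix1.f
  const double cf[ncolor][ncolor] = { { 64, -8, -8, 1, 1, 10 },
                                      { -8, 64, 1, 10, -8, 1 },
                                      { -8, 1, 64, -8, 10, 1 },
                                      { 1, 10, -8, 64, 1, -8 },
                                      { 1, -8, 10, 1, 64, -8 },
                                      { 10, 1, 1, -8, -8, 64 } };
  // Packed upper triangle with doubled off-diagonals (matches DATA (CF(I),I=1,21) above)
  int cfPacked[ncolor * ( ncolor + 1 ) / 2];
  int idx = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = i; j < ncolor; j++ )
      cfPacked[idx++] = (int)( ( i == j ? 1 : 2 ) * cf[i][j] );
  // Arbitrary complex color amplitudes (test inputs only)
  const std::complex<double> jamp[ncolor] = { { 1, 2 }, { -3, 0.5 }, { 0, 1 }, { 2, -2 }, { 0.1, 0 }, { -1, -1 } };
  // Full symmetric double sum
  double full = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      full += ( std::conj( jamp[i] ) * cf[i][j] * jamp[j] ).real() / denom;
  // Triangular sum with the packed matrix (one running index, as CF_INDEX in matrix1.f)
  double tri = 0;
  idx = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = i; j < ncolor; j++ ) ztemp += (double)cfPacked[idx++] * jamp[j];
    tri += ( ztemp * std::conj( jamp[i] ) ).real();
  }
  tri /= denom;
  assert( std::abs( full - tri ) <= 1e-9 * ( 1.0 + std::abs( full ) ) );
  return 0;
}
```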
- IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -549,10 +525,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -561,6 +539,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index 4f8f49270b..c90527fa03 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,9 +101,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -109,10 +112,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,43 +171,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -219,7 +275,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -228,14 +283,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
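
The kernel above now writes each helicity's jamps into the ghelAllJamps super-buffer through DeviceAccessJamp, and sigmaKin/color_sum_gpu pass "ghelAllJamps + ighel * nevt" to per-helicity kernels that then index it with ihel0 = 0. DeviceAccessJamp itself is not part of this patch, so the helper below is an illustrative reconstruction of the [2][ncolor][nGoodHel][nevt] striding spelled out in convertD2F_Jamps (real parts first, then imaginary parts); it checks that the per-helicity pointer offset is equivalent to indexing the full super-buffer at ihel = ighel.

```cpp
// Illustrative reconstruction of the jamp super-buffer striding (assumed layout, not plugin code).
#include <cassert>
#include <cstddef>

// Flat index of the ix2 part (0 = real, 1 = imag) of color icol, helicity ihel, event ievt,
// for a [2][ncolor][nhel][nevt] buffer laid out as in convertD2F_Jamps.
inline std::size_t jampIndex( int ix2, int icol, int ihel, int ievt, int ncolor, int nhel, int nevt )
{
  return ( ( static_cast<std::size_t>( ix2 ) * ncolor + icol ) * nhel + ihel ) * static_cast<std::size_t>( nevt ) + ievt;
}

int main()
{
  const int ncolor = 4, nhel = 5, nevt = 1024; // ncolor = 4 as in this gu->ttxu subprocess
  for( int ighel = 0; ighel < nhel; ighel++ )
    for( int icol = 0; icol < ncolor; icol++ )
      for( int ix2 = 0; ix2 < 2; ix2++ )
        // Offsetting the base pointer by ighel * nevt and using ihel0 = 0 inside the kernel
        // addresses the same element as indexing the full super-buffer at ihel = ighel.
        assert( jampIndex( ix2, icol, 0, 7, ncolor, nhel, nevt ) + static_cast<std::size_t>( ighel ) * nevt == jampIndex( ix2, icol, ighel, 7, ncolor, nhel, nevt ) );
  return 0;
}
```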
@@ -261,14 +319,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -292,7 +346,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -306,7 +359,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -317,6 +369,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -405,156 +461,43 @@ namespace mg5amcCpu jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gu_ttxu()?) 
- - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -610,7 +553,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -644,6 +591,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -685,6 +636,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -805,8 +760,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -814,25 +769,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -977,13 +1110,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -995,18 +1122,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1031,93 +1163,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1159,7 +1228,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1182,7 +1251,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1191,21 +1260,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1219,8 +1290,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1236,11 +1309,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1342,14 +1416,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h index b501a9772e..2c0025c7b9 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,6 +79,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 5; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -125,7 +127,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -133,9 +135,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -155,34 +159,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum 
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f index b0cc58e89c..340d51dbfa 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f index 2b281a8200..83f5f0b209 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,14 +140,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF D2=PDG2PDF(LPP(IB(2)),1, IB(2),XBK(IB(2)), QSCALE) U2=PDG2PDF(LPP(IB(2)),2, IB(2),XBK(IB(2)), QSCALE) @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -309,6 +309,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -393,18 +397,18 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) D2(IVEC)=PDG2PDF(LPP(IB(2)),1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) U2(IVEC)=PDG2PDF(LPP(IB(2)),2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) S2(IVEC)=PDG2PDF(LPP(IB(2)),3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) C2(IVEC)=PDG2PDF(LPP(IB(2)),4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -486,51 +490,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/color_sum.cc new file mode 100644 index 0000000000..42eca2f7c9 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/color_sum.cc @@ -0,0 +1,429 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
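In formula form (a restatement of the folded loop below, not additional functionality), the triangular implementation computes, per helicity,

\Delta|M|^2 \;=\; \sum_i \frac{cf_{ii}}{\mathrm{denom}_i}\,\big( \mathrm{Re}^2 J_i + \mathrm{Im}^2 J_i \big)
\;+\; \sum_i \sum_{j>i} \frac{2\,cf_{ij}}{\mathrm{denom}_i}\,\big( \mathrm{Re}\,J_i\,\mathrm{Re}\,J_j + \mathrm{Im}\,J_i\,\mathrm{Im}\,J_j \big)

which equals the full symmetric double sum because colorMatrix is symmetric and, for this process, all colorDenom entries are equal (they are all 1 here); the factors cf_ii/denom_i and 2*cf_ij/denom_i are exactly the entries precomputed at compile time in TriangularNormalizedColorMatrix (cf2).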
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt)
+                                              nevtN ) );                        // there are nevtN (nhel*nevt) "batches"
+
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    // Convert MEs from float to double
+    for( int ighel = 0; ighel < nhel; ighel++ )
+    {
+      fptype* hAllMEs = ghelAllMEs + ighel * nevt;          // MEs for a single helicity ihel
+      fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel
+      gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 );
+    }
+#endif
+  }
+#endif /* clang-format on */
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_gpu( fptype* ghelAllMEs,           // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
+                 fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+                 const int nGoodHel,           // input: number of good helicities
+                 const int gpublocks,          // input: cuda gpublocks
+                 const int gputhreads )        // input: cuda gputhreads
+  {
+    const int nevt = gpublocks * gputhreads;
+    // CASE 1: KERNEL
+    if( !pBlasHandle )
+    {
+      assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+      // Loop over helicities
+      for( int ighel = 0; ighel < nGoodHel; ighel++ )
+      {
+        fptype* hAllMEs = ghelAllMEs + ighel * nevt;           // MEs for one specific helicity ighel
+        const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel
+        gpuStream_t hStream = ghelStreams[ighel];
+        gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel );
+      }
+    }
+    // CASE 2: BLAS
+    else
+    {
+#ifdef MGONGPU_HAS_NO_BLAS
+      assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas
+#else
+      checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed
+      // Reset the tmp buffer
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) );
+#else
+      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) );
+#endif
+      // Delegate the color sum to BLAS for all good helicities
+      color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads );
+#endif
+    }
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+} // end namespace
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/color_sum.h
new file mode 120000
index 0000000000..24b0157011
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/color_sum.h
@@ -0,0 +1 @@
+../color_sum.h
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/configs.inc
index 225cf5aca4..0a6b8dbc07 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/configs.inc
+++
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/configs.inc @@ -57,3 +57,5 @@ C Diagram 5 DATA (SPROP(I,-3,5),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/5/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/driver.f index c2eadb2c31..aa93a3d195 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f index 9394a561b8..8aa675cd01 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -74,10 +74,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -246,17 +243,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -326,7 +312,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -372,7 +358,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -415,31 +402,28 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(1,3,2) T(5,4) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(1,3,4) T(5,2) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(1,5,2) T(3,4) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(1,5,4) T(3,2) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
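Note on the packed color matrix introduced above: the DATA statements replace the full NCOLOR x NCOLOR real matrix CF with a single integer DENOM and an integer array holding only the upper triangle of each row, with the off-diagonal entries already multiplied by 2 (compare 12,8,8,0 against the old row 12,4,4,0); the CF_INDEX counter in the next hunk walks this packed array row by row. A minimal standalone C++ sketch of the same index arithmetic (the helper name cfIndex and its 1-based convention are illustrative assumptions, not part of the generated Fortran):

#include <cassert>

// Position (1-based) of entry (I,J), with J >= I, in a row-major packed upper triangle
// of an ncolor x ncolor symmetric matrix: rows 1..I-1 contribute ncolor, ncolor-1, ... entries.
int cfIndex( int I, int J, int ncolor )
{
  assert( 1 <= I && I <= J && J <= ncolor );
  const int skipped = ( I - 1 ) * ncolor - ( I - 1 ) * ( I - 2 ) / 2; // entries in rows 1..I-1
  return skipped + ( J - I + 1 );
}

For ncolor=4 this gives cfIndex(1,1)=1 ... cfIndex(1,4)=4, cfIndex(2,2)=5, cfIndex(3,3)=8 and cfIndex(4,4)=10, matching the DATA ranges above (10 = 4*5/2 entries in total); the common normalization is divided out once at the end via MATRIX1 = MATRIX1/DENOM.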
@@ -492,10 +476,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -504,6 +490,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index e2d65a2667..812f8dec18 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,9 +101,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -109,10 +112,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,43 +171,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -219,7 +275,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -228,14 +283,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
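For reference, the jamp super-buffer that calculate_jamps writes into (and that the color_sum code earlier in this diff reads back, see its striding comments) is laid out as [2][ncolor][nGoodHel][nevt], with the real/imaginary split as the slowest index and the event index as the fastest. A small hedged sketch of that flat indexing (the helper name jampIndex and its argument order are illustrative, not the plugin's DeviceAccessJamp API):

#include <cstddef>

// Flat offset of jamp(part, icol, ihel, ievt) in a [2][ncolor][nGoodHel][nevt] buffer,
// i.e. part*ncolor*nGoodHel*nevt + icol*nGoodHel*nevt + ihel*nevt + ievt.
inline std::size_t jampIndex( int part,   // 0 = real part, 1 = imaginary part
                              int icol,   // color flow index in [0, ncolor)
                              int ihel,   // good-helicity index in [0, nGoodHel)
                              int ievt,   // event index in [0, nevt)
                              int ncolor, int nGoodHel, int nevt )
{
  return ( ( (std::size_t)part * ncolor + icol ) * nGoodHel + ihel ) * nevt + ievt;
}

Keeping ievt as the fastest index should give coalesced per-thread accesses in the kernels, while keeping the real/imaginary split outermost lets the BLAS path treat each part as a dense (ncolor) x (nGoodHel*nevt) matrix.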
@@ -261,14 +319,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -292,7 +346,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -306,7 +359,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -317,6 +369,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -405,156 +461,43 @@ namespace mg5amcCpu jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gux_ttxux()?) 
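The atomicAdd introduced above is needed because, with one CUDA stream per good helicity, several calculate_jamps kernels may be in flight at the same time and all accumulate into the same colAllJamp2s slot for a given (icol, ievt); within one kernel each thread owns its own ievt, so the race is only across streams. A minimal hedged CUDA sketch of that accumulation pattern (kernel and buffer names are illustrative, not the plugin's API; double-precision atomicAdd needs compute capability 6.0 or newer):

#include <cuda_runtime.h>

// One such kernel is launched per good helicity, each in its own stream; they all
// add their |jamp(icol)|^2 contribution into the same shared [ncolor][nevt] buffer.
__global__ void accumulateJamp2( double* jamp2,          // shared running sum, [ncolor * nevt]
                                 const double* absJamp2, // this helicity's contribution, [ncolor * nevt]
                                 int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockIdx.x * blockDim.x + threadIdx.x;
  for( int icol = 0; icol < ncolor; icol++ )
    atomicAdd( &jamp2[icol * nevt + ievt], absJamp2[icol * nevt + ievt] ); // safe across concurrent streams
}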
- - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
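The comment removed above ("Use the property that M is a real matrix...") relies on the identity that, for a real symmetric M and color amplitudes J = A + iB, the quadratic form J^dagger M J equals A^T M A + B^T M B, the imaginary cross terms cancelling because A^T M B = B^T M A. A tiny self-contained numerical check of that identity (arbitrary 2x2 example, not any generated color matrix):

#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  const double M[2][2] = { { 12., 4. }, { 4., 12. } };            // a real symmetric matrix
  const std::complex<double> J[2] = { { 1., 2. }, { -3., 0.5 } }; // J = A + iB
  std::complex<double> lhs = 0.;
  double rhs = 0.;
  for( int i = 0; i < 2; i++ )
    for( int j = 0; j < 2; j++ )
    {
      lhs += std::conj( J[i] ) * M[i][j] * J[j];                                        // J^dagger M J
      rhs += J[i].real() * M[i][j] * J[j].real() + J[i].imag() * M[i][j] * J[j].imag(); // A^T M A + B^T M B
    }
  assert( std::abs( lhs.imag() ) < 1e-12 );      // cross terms cancel for symmetric M
  assert( std::abs( lhs.real() - rhs ) < 1e-12 ); // real part equals AMA + BMB
  return 0;
}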
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -610,7 +553,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -644,6 +591,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -685,6 +636,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -805,8 +760,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -814,25 +769,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -977,13 +1110,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -995,18 +1122,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1031,93 +1163,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1159,7 +1228,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1182,7 +1251,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1191,21 +1260,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1219,8 +1290,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1236,11 +1309,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1342,14 +1416,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h index d658e0394e..7a811e35e9 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,6 +79,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 5; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -125,7 +127,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -133,9 +135,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -155,34 +159,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum 
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f index e36675626f..f9cde14dc2 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f index 61bb13c3e7..136c6cded7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,14 +140,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -309,6 +309,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -393,18 +397,18 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -486,51 +490,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/color_sum.cc new file mode 100644 index 0000000000..42eca2f7c9 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/color_sum.cc @@ -0,0 +1,429 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
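The new color_sum.cc file introduced here factors the colour algebra out of the former calculate_wavefunctions: for one helicity it adds |M|^2 = sum_{i,j} Re( conj(jamp_i) * cf[i][j] * jamp_j ) / denom[i] to the running sum over helicities, and the kernel, SIMD and BLAS (GEMM-based) paths that follow are all meant to reproduce this same dense quadratic form. As a point of reference for those optimised paths, here is a minimal standalone sketch in plain C++, hardcoding the 4x4 matrix that appears just below; the names are illustrative and not the plugin's API.

  #include <array>
  #include <complex>
  #include <cstdio>

  // Reference (dense) colour sum for one event and one helicity:
  // deltaME = sum_{i,j} Re( conj( jamp[i] ) * cf[i][j] * jamp[j] ) / denom[i]
  // Matrix values are those of this P1_gux_ttxux subprocess (ncolor=4, all denominators 1).
  constexpr int ncolor = 4;
  constexpr double denom[ncolor] = { 1, 1, 1, 1 };
  constexpr double cf[ncolor][ncolor] = {
    { 12, 4, 4, 0 },
    { 4, 12, 0, 4 },
    { 4, 0, 12, 4 },
    { 0, 4, 4, 12 } };

  double colorSumDense( const std::array<std::complex<double>, ncolor>& jamp )
  {
    double deltaME = 0;
    for( int i = 0; i < ncolor; i++ )
    {
      std::complex<double> ztemp = 0;
      for( int j = 0; j < ncolor; j++ ) ztemp += cf[i][j] * jamp[j];
      deltaME += ( std::conj( jamp[i] ) * ztemp ).real() / denom[i];
    }
    return deltaME;
  }

  int main()
  {
    const std::array<std::complex<double>, ncolor> jamp = { { { 0.1, -0.2 }, { 0.3, 0.05 }, { -0.4, 0.1 }, { 0.2, 0.3 } } };
    std::printf( "deltaME (dense) = %f\n", colorSumDense( jamp ) );
    return 0;
  }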
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
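The comment block above motivates the optimised CPU path: because the colour matrix is real, the quadratic form conj(J).M.J reduces to A.M.A + B.M.B with A = Re(J) and B = Im(J), and because it is symmetric only the upper triangle needs to be visited once the off-diagonal entries have been pre-doubled and pre-divided by the row denominator at compile time. A standalone sketch of that idea, cross-checked against the plain dense form (same illustrative 4x4 matrix as in the previous sketch; the names are not the plugin's):

  #include <array>
  #include <cassert>
  #include <cmath>
  #include <complex>

  constexpr int ncolor = 4;
  constexpr double denom[ncolor] = { 1, 1, 1, 1 };
  constexpr double cf[ncolor][ncolor] = {
    { 12, 4, 4, 0 }, { 4, 12, 0, 4 }, { 4, 0, 12, 4 }, { 0, 4, 4, 12 } };

  // Upper-triangular, normalised copy of cf: diagonal = cf[i][i]/denom[i], off-diagonal = 2*cf[i][j]/denom[i]
  struct TriangularNormalized
  {
    constexpr TriangularNormalized()
      : value()
    {
      for( int i = 0; i < ncolor; i++ )
      {
        value[i][i] = cf[i][i] / denom[i];
        for( int j = i + 1; j < ncolor; j++ ) value[i][j] = 2 * cf[i][j] / denom[i];
      }
    }
    double value[ncolor][ncolor];
  };

  // Real matrix: conj(J).cf.J = A.cf.A + B.cf.B; symmetric matrix: only visit j >= i
  double colorSumTriangular( const std::array<std::complex<double>, ncolor>& jamp )
  {
    static constexpr auto cf2 = TriangularNormalized();
    double deltaME = 0;
    for( int i = 0; i < ncolor; i++ )
    {
      double ztempR = cf2.value[i][i] * jamp[i].real();
      double ztempI = cf2.value[i][i] * jamp[i].imag();
      for( int j = i + 1; j < ncolor; j++ )
      {
        ztempR += cf2.value[i][j] * jamp[j].real();
        ztempI += cf2.value[i][j] * jamp[j].imag();
      }
      deltaME += jamp[i].real() * ztempR + jamp[i].imag() * ztempI;
    }
    return deltaME;
  }

  int main()
  {
    const std::array<std::complex<double>, ncolor> jamp = { { { 0.1, -0.2 }, { 0.3, 0.05 }, { -0.4, 0.1 }, { 0.2, 0.3 } } };
    double dense = 0; // straightforward dense quadratic form for comparison
    for( int i = 0; i < ncolor; i++ )
      for( int j = 0; j < ncolor; j++ )
        dense += ( std::conj( jamp[i] ) * cf[i][j] * jamp[j] ).real() / denom[i];
    assert( std::abs( colorSumTriangular( jamp ) - dense ) < 1e-12 );
    return 0;
  }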
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/configs.inc index 693e4354b0..28a94fd35a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/configs.inc +++ 
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/configs.inc @@ -57,3 +57,5 @@ C Diagram 5 DATA (SPROP(I,-3,5),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/5/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/driver.f index c2eadb2c31..aa93a3d195 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f index c7fdad381b..f77432fcd1 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -74,10 +74,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -246,17 +243,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -326,7 +312,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -372,7 +358,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -415,31 +402,28 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(1,2,4) T(3,5) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(1,2,5) T(3,4) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(1,3,4) T(2,5) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(1,3,5) T(2,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
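In the matrix1.f hunk above, the dense REAL*8 CF(NCOLOR,NCOLOR) colour matrix becomes a packed INTEGER array holding only the upper triangle row by row, with off-diagonal entries pre-doubled (8 instead of 4) and a single integer DENOM applied once at the end; the matching loop with the running CF_INDEX counter appears in the next hunk. A plain C++ sketch (hypothetical names) of the same packing and traversal, cross-checked against the dense matrix:

  #include <cassert>
  #include <cmath>
  #include <complex>

  // Packed upper triangle of the 4x4 colour matrix, row by row, with off-diagonal entries
  // already doubled: {12,8,8,0 | 12,0,8 | 12,8 | 12}; the common denominator here is 1.
  constexpr int ncolor = 4;
  constexpr int denomPacked = 1;
  constexpr int cfPacked[ncolor * ( ncolor + 1 ) / 2] = { 12, 8, 8, 0, 12, 0, 8, 12, 8, 12 };
  constexpr double cfDense[ncolor][ncolor] = {
    { 12, 4, 4, 0 }, { 4, 12, 0, 4 }, { 4, 0, 12, 4 }, { 0, 4, 4, 12 } };

  int main()
  {
    const std::complex<double> jamp[ncolor] = { { 0.1, -0.2 }, { 0.3, 0.05 }, { -0.4, 0.1 }, { 0.2, 0.3 } };
    // Packed traversal with a running index, mirroring the Fortran CF_INDEX loop
    double packedSum = 0;
    int cfIndex = 0;
    for( int i = 0; i < ncolor; i++ )
    {
      std::complex<double> ztemp = 0;
      for( int j = i; j < ncolor; j++ ) ztemp += (double)cfPacked[cfIndex++] * jamp[j];
      packedSum += ( ztemp * std::conj( jamp[i] ) ).real();
    }
    packedSum /= denomPacked;
    // Dense reference (denominators are all 1 for this subprocess)
    double denseSum = 0;
    for( int i = 0; i < ncolor; i++ )
      for( int j = 0; j < ncolor; j++ )
        denseSum += cfDense[i][j] * ( std::conj( jamp[i] ) * jamp[j] ).real();
    assert( std::abs( packedSum - denseSum ) < 1e-12 );
    return 0;
  }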
@@ -492,10 +476,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -504,6 +490,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc index 4f41927bc9..e7e58d3385 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,9 +101,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -109,10 +112,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,43 +171,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -219,7 +275,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -228,14 +283,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
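calculate_jamps now writes the colour amplitudes of each good helicity into a global 'super-buffer' shared by all helicities. Following the striding comments in color_sum_blas earlier in this diff, an element is addressed as allJamps[ix2*ncolor*nhel*nevt + icol*nhel*nevt + ihel*nevt + ievt], where ix2 = 0/1 selects the real or imaginary part. Below is a host-side sketch of an indexing helper for that layout; the plugin itself goes through the DeviceAccessJamp class, so the helper name and the checks are only illustrative.

  #include <cassert>
  #include <cstddef>
  #include <vector>

  // Flat index into the jamp super-buffer laid out as [ix2][icol][ihel][ievt]:
  // all real parts first (ix2=0), then all imaginary parts (ix2=1); within each part,
  // events are contiguous for a given ( icol, ihel ) pair.
  inline std::size_t jampIndex( int ix2, int icol, int ihel, int ievt, int ncolor, int nhel, int nevt )
  {
    assert( ix2 >= 0 && ix2 < 2 );
    assert( icol >= 0 && icol < ncolor );
    assert( ihel >= 0 && ihel < nhel );
    assert( ievt >= 0 && ievt < nevt );
    return ( ( (std::size_t)ix2 * ncolor + icol ) * nhel + ihel ) * nevt + ievt;
  }

  int main()
  {
    const int ncolor = 4, nhel = 3, nevt = 8;
    std::vector<double> jamps( 2 * ncolor * nhel * nevt, 0. );
    // Write the real and imaginary parts of jamp( icol=2 ) for helicity 1, event 5
    jamps[jampIndex( 0, 2, 1, 5, ncolor, nhel, nevt )] = 0.25;  // real part
    jamps[jampIndex( 1, 2, 1, 5, ncolor, nhel, nevt )] = -0.75; // imaginary part
    // Events are the fastest index: for each colour, the ( ihel, ievt ) pairs form one
    // contiguous column of length nhel*nevt, which the GEMM calls in color_sum_blas rely on
    assert( jampIndex( 0, 2, 1, 6, ncolor, nhel, nevt ) == jampIndex( 0, 2, 1, 5, ncolor, nhel, nevt ) + 1 );
    return 0;
  }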
@@ -261,14 +319,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -292,7 +346,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -306,7 +359,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -317,6 +369,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -405,156 +461,43 @@ namespace mg5amcCpu jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_uux_ttxg()?) 
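The '*** COLOR CHOICE BELOW ***' lines above keep a running sum of |jamp(icol)|^2 over the good helicities: directly into jamp2_sv on the C++/SIMD path, and through an atomicAdd into the colAllJamp2s super-buffer on the GPU path shown just below, which is needed now that each helicity runs in its own stream. These per-colour sums feed the event-by-event colour selection, in the same spirit as the helicity selection via the MEs_ighel running sums seen earlier in this diff. A simplified plain C++ sketch of the idea (hypothetical names; the plugin's actual selection logic in sigmaKin is more involved):

  #include <cassert>
  #include <complex>
  #include <vector>

  // Accumulate |jamp(icol)|^2 over helicities, then pick a colour from the cumulative
  // distribution using one uniform random number in [0,1) (standing in for allrndcol).
  int chooseColor( const std::vector<std::vector<std::complex<double>>>& jampPerHel, // [ihel][icol]
                   double rndcol )
  {
    const int ncolor = (int)jampPerHel[0].size();
    std::vector<double> jamp2( ncolor, 0. );
    for( const auto& jamp : jampPerHel )
      for( int icol = 0; icol < ncolor; icol++ )
        jamp2[icol] += std::norm( jamp[icol] ); // |jamp|^2, summed over helicities
    double total = 0;
    for( int icol = 0; icol < ncolor; icol++ ) total += jamp2[icol];
    double running = 0;
    for( int icol = 0; icol < ncolor; icol++ )
    {
      running += jamp2[icol];
      if( rndcol < running / total ) return icol; // 0-based here (Fortran expects 1-based)
    }
    return ncolor - 1; // guard against rounding when rndcol is very close to 1
  }

  int main()
  {
    const std::vector<std::vector<std::complex<double>>> jampPerHel = {
      { { 0.1, -0.2 }, { 0.3, 0.05 }, { -0.4, 0.1 }, { 0.2, 0.3 } },
      { { 0.0, 0.1 }, { -0.2, 0.2 }, { 0.3, -0.1 }, { 0.1, 0.0 } } };
    const int icolSel = chooseColor( jampPerHel, 0.7 );
    assert( icolSel >= 0 && icolSel < 4 );
    return 0;
  }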
- - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
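The lines removed below, together with their replacement inside color_sum_cpu earlier in this diff, implement the 'mixed' floating-point mode: the colour algebra runs in single precision on a merged vector that packs two double-precision event pages (fpvmerge), and the two halves of the result are split back out (fpvsplit0 / fpvsplit1) into the double-precision running sums. A scalar-loop sketch of that merge/compute/split pattern, assuming neppV = 4 and a simple concatenation of the two pages (the lane ordering of the actual SIMD helpers in mgOnGpuVectors.h may differ):

  #include <array>
  #include <cstdio>

  constexpr int neppV = 4; // events per double-precision SIMD page (an assumption for this sketch)

  int main()
  {
    // Two double-precision event pages (e.g. the real parts of one colour amplitude)
    const std::array<double, neppV> pageA = { 0.10, 0.20, 0.30, 0.40 };
    const std::array<double, neppV> pageB = { 0.50, 0.60, 0.70, 0.80 };

    // "fpvmerge": pack both pages into one single-precision vector of 2*neppV events
    std::array<float, 2 * neppV> merged;
    for( int i = 0; i < neppV; i++ ) merged[i] = (float)pageA[i];
    for( int i = 0; i < neppV; i++ ) merged[neppV + i] = (float)pageB[i];

    // Run the cheap, precision-tolerant part in single precision
    // (here squaring each entry as a stand-in for the colour quadratic form)
    std::array<float, 2 * neppV> result;
    for( int i = 0; i < 2 * neppV; i++ ) result[i] = merged[i] * merged[i];

    // "fpvsplit0" / "fpvsplit1": unpack the two halves and accumulate in double precision
    double sumA = 0, sumB = 0;
    for( int i = 0; i < neppV; i++ ) sumA += (double)result[i];
    for( int i = 0; i < neppV; i++ ) sumB += (double)result[neppV + i];
    std::printf( "page A contribution = %f, page B contribution = %f\n", sumA, sumB );
    return 0;
  }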
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -610,7 +553,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -644,6 +591,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -685,6 +636,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -805,8 +760,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -814,25 +769,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -977,13 +1110,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -995,18 +1122,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1031,93 +1163,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1159,7 +1228,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1182,7 +1251,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1191,21 +1260,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1219,8 +1290,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1236,11 +1309,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1342,14 +1416,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h index ebf14aca9e..013d386f6c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,6 +79,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 5; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -125,7 +127,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -133,9 +135,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -155,34 +159,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum 
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f index d46dad4fcb..f43ba8ff39 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f index d8e94d91bb..76b1a9dd93 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,7 +140,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) @@ -150,7 +150,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -237,7 +237,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -313,6 +313,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -398,24 +402,24 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -497,51 +501,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/color_sum.cc new file mode 100644 index 0000000000..42eca2f7c9 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/color_sum.cc @@ -0,0 +1,429 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for all nGoodHel helicities + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/configs.inc index 897255fa04..907b407e8e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/configs.inc +++ 
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/configs.inc @@ -51,3 +51,5 @@ C Diagram 5 DATA TPRID(-2,5)/0/ C Number of configs DATA MAPCONFIG(0)/5/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/driver.f index c2eadb2c31..aa93a3d195 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f index 787dae76b2..7dc0b8e911 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -74,10 +74,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -246,17 +243,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -326,7 +312,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -372,7 +358,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -415,31 +402,28 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(2,1) T(5,3,4) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(2,4) T(5,3,1) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(3,1) T(5,2,4) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(3,4) T(5,2,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
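The two matrix1.f hunks around this point replace the dense REAL*8 CF(NCOLOR,NCOLOR) color matrix with an INTEGER array that stores only the upper triangle (row by row, starting at the diagonal, with off-diagonal entries pre-doubled) plus a single DENOM; the CF_INDEX loop in the next hunk walks that packed array. The standalone C++ sketch below (hypothetical function and variable names, not part of the generated code) illustrates why a single triangular pass followed by one division reproduces the full symmetric color sum.

#include <complex>
#include <vector>

// Hypothetical illustration of the packed upper-triangular color sum (not generated code):
// cf[] stores row i from the diagonal onwards with doubled off-diagonal entries, so taking
// the real part of ztemp * conj(jamp[i]) picks up both the (i,j) and (j,i) contributions.
double colorSumPacked( const std::vector<int>& cf,                    // size ncolor*(ncolor+1)/2
                       const std::vector<std::complex<double>>& jamp, // partial amplitudes, size ncolor
                       const int denom )
{
  const int ncolor = (int)jamp.size();
  double me2 = 0;
  int cfIndex = 0; // same role as CF_INDEX in the Fortran loop of the next hunk
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = i; j < ncolor; j++ ) ztemp += double( cf[cfIndex++] ) * jamp[j]; // row i, columns j >= i
    me2 += std::real( ztemp * std::conj( jamp[i] ) );
  }
  return me2 / denom;
}
// Example with the 4-color DATA statements above: cf = {12,8,8,0, 12,0,8, 12,8, 12}, denom = 1.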
@@ -492,10 +476,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -504,6 +490,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc index da962495fd..1721f42b1f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 24; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities -#endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId 
= 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) +#endif + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using 
E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
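The device buffers introduced above are flat arrays addressed through hand-written strides: DeviceAccessJamp2 maps (icol, ievt) to icol * nevt + ievt, and the per-helicity jamp super-buffer is laid out as [real/imag][icol][ighel][ievt], as spelled out in the cuBLAS striding comment inside color_sum_blas. The host-side helpers below (hypothetical names jamp2Index and jampIndex, for illustration only; the plugin itself goes through its memory-access classes) make that arithmetic explicit.

// Hypothetical index helpers mirroring the strides documented in this diff (illustration only).
inline int jamp2Index( const int icol, const int ievt, const int nevt )
{
  // same arithmetic as DeviceAccessJamp2::kernelAccessIcol, with ievt derived from the thread id on device
  return icol * nevt + ievt;
}

inline int jampIndex( const int reim,  // 0 = real part, 1 = imaginary part
                      const int icol,  // color flow index
                      const int ighel, // good-helicity index
                      const int ievt,  // event index
                      const int ncolor, const int nGoodHel, const int nevt )
{
  // allJamps super-buffer layout [2][ncolor][nGoodHel][nevt], matching the cuBLAS striding comment
  return reim * ncolor * nGoodHel * nevt + icol * nGoodHel * nevt + ighel * nevt + ievt;
}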
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -2461,176 +2517,43 @@ namespace mg5amcCpu jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_gg_ttxgg()?) 
- - // The color denominators (initialize all array elements, with ncolor=24) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54 }; // 1-D array[24] - - // The color matrix (initialize all array elements, with ncolor=24) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 512, -64, -64, 8, 8, 80, -64, 8, 8, -1, -1, -10, 8, -1, 80, -10, 71, 62, -1, -10, -10, 62, 62, -28 }, - { -64, 512, 8, 80, -64, 8, 8, -64, -1, -10, 8, -1, -1, -10, -10, 62, 62, -28, 8, -1, 80, -10, 71, 62 }, - { -64, 8, 512, -64, 80, 8, 8, -1, 80, -10, 71, 62, -64, 8, 8, -1, -1, -10, -10, -1, 62, -28, -10, 62 }, - { 8, 80, -64, 512, 8, -64, -1, -10, -10, 62, 62, -28, 8, -64, -1, -10, 8, -1, -1, 8, 71, 62, 80, -10 }, - { 8, -64, 80, 8, 512, -64, -1, 8, 71, 62, 80, -10, -10, -1, 62, -28, -10, 62, -64, 8, 8, -1, -1, -10 }, - { 80, 8, 8, -64, -64, 512, -10, -1, 62, -28, -10, 62, -1, 8, 71, 62, 80, -10, 8, -64, -1, -10, 8, -1 }, - { -64, 8, 8, -1, -1, -10, 512, -64, -64, 8, 8, 80, 80, -10, 8, -1, 62, 71, -10, 62, -1, -10, -28, 62 }, - { 8, -64, -1, -10, 8, -1, -64, 512, 8, 80, -64, 8, -10, 62, -1, -10, -28, 62, 80, -10, 8, -1, 62, 71 }, - { 8, -1, 80, -10, 71, 62, -64, 8, 512, -64, 80, 8, 8, -1, -64, 8, -10, -1, 62, -28, -10, -1, 62, -10 }, - { -1, -10, -10, 62, 62, -28, 8, 80, -64, 512, 8, -64, -1, -10, 8, -64, -1, 8, 71, 62, -1, 8, -10, 80 }, - { -1, 8, 71, 62, 80, -10, 8, -64, 80, 8, 512, -64, 62, -28, -10, -1, 62, -10, 8, -1, -64, 8, -10, -1 }, - { -10, -1, 62, -28, -10, 62, 80, 8, 8, -64, -64, 512, 71, 62, -1, 8, -10, 80, -1, -10, 8, -64, -1, 8 }, - { 8, -1, -64, 8, -10, -1, 80, -10, 8, -1, 62, 71, 512, -64, -64, 8, 8, 80, 62, -10, -28, 62, -1, -10 }, - { -1, -10, 8, -64, -1, 8, -10, 62, -1, -10, -28, 62, -64, 512, 8, 80, -64, 8, -10, 80, 62, 71, 8, -1 }, - { 80, -10, 8, -1, 62, 71, 8, -1, -64, 8, -10, -1, -64, 8, 512, -64, 80, 8, -28, 62, 62, -10, -10, -1 }, - { -10, 62, -1, -10, -28, 62, -1, -10, 8, -64, -1, 8, 8, 80, -64, 512, 8, -64, 62, 71, -10, 80, -1, 8 }, - { 71, 62, -1, 8, -10, 80, 62, -28, -10, -1, 62, -10, 8, -64, 80, 8, 512, -64, -1, 8, -10, -1, -64, 8 }, - { 62, -28, -10, -1, 62, -10, 71, 62, -1, 8, -10, 80, 80, 8, 8, -64, -64, 512, -10, -1, -1, 8, 8, -64 }, - { -1, 8, -10, -1, -64, 8, -10, 80, 62, 71, 8, -1, 62, -10, -28, 62, -1, -10, 512, -64, -64, 8, 8, 80 }, - { -10, -1, -1, 8, 8, -64, 62, -10, -28, 62, -1, -10, -10, 80, 62, 71, 8, -1, -64, 512, 8, 80, -64, 8 }, - { -10, 80, 62, 71, 8, -1, -1, 8, -10, -1, -64, 8, -28, 62, 62, -10, -10, -1, -64, 8, 512, -64, 80, 8 }, - { 62, -10, -28, 62, -1, -10, -10, -1, -1, 8, 8, -64, 62, 71, -10, 80, -1, 8, 8, 80, -64, 512, 8, -64 }, - { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, - { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - 
value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -2718,7 +2641,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } 
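The in-kernel color algebra removed in the hunks above, and the BLAS path that replaces it, both rest on the color matrix being real and symmetric: the quadratic form (A - iB) M (A + iB) over the jamps reduces to A M A + B M B, so the real and imaginary jamp components can be contracted with the normalized color matrix independently. A self-contained sketch of that identity is given below (hypothetical names, illustration only, double precision for simplicity).

#include <complex>
#include <vector>

using cxd = std::complex<double>;

// Hypothetical check of the identity used by both the removed in-kernel color sum and the
// BLAS color sum: for a real color matrix m, Re( J^dagger m J ) equals the sum of the two
// real quadratic forms over the real and imaginary parts of the jamps.
double me2Complex( const std::vector<std::vector<double>>& m, const std::vector<cxd>& jamp )
{
  double me2 = 0;
  for( size_t i = 0; i < jamp.size(); i++ )
    for( size_t j = 0; j < jamp.size(); j++ )
      me2 += std::real( std::conj( jamp[i] ) * m[i][j] * jamp[j] );
  return me2;
}

double me2RealImag( const std::vector<std::vector<double>>& m, const std::vector<cxd>& jamp )
{
  double me2 = 0;
  for( size_t i = 0; i < jamp.size(); i++ )
    for( size_t j = 0; j < jamp.size(); j++ )
      me2 += m[i][j] * ( jamp[i].real() * jamp[j].real() + jamp[i].imag() * jamp[j].imag() ); // AMA + BMB
  return me2;
}
// The two functions agree term by term for any real m, which is why the real and imaginary
// jamp buffers can be processed in separate real GEMMs, as Steps 1 and 2 of color_sum_blas do.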
//-------------------------------------------------------------------------- @@ -2753,6 +2680,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -2795,6 +2726,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -2915,8 +2850,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -2924,25 +2859,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr 
to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -3087,13 +3200,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 512 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -3105,18 +3212,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -3141,93 +3253,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -3269,7 +3318,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -3292,7 +3341,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -3301,21 +3350,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -3329,8 +3380,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -3346,11 +3399,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -3452,14 +3506,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h index b6e3ba16d4..65b3e1d2ac 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 123; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 24; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running 
sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f index 850bc73f22..23a723f0df 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f index 7af9753fb7..1e6337aaac 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -442,51 +446,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 
0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/color_sum.cc new file mode 100644 index 0000000000..91a7f9998e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/color_sum.cc @@ -0,0 +1,449 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=24) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54 }; // 1-D array[24] + + // The color matrix (initialize all array elements, with ncolor=24) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 512, -64, -64, 8, 8, 80, -64, 8, 8, -1, -1, -10, 8, -1, 80, -10, 71, 62, -1, -10, -10, 62, 62, -28 }, + { -64, 512, 8, 80, -64, 8, 8, -64, -1, -10, 8, -1, -1, -10, -10, 62, 62, -28, 8, -1, 80, -10, 71, 62 }, + { -64, 8, 512, -64, 80, 8, 8, -1, 80, -10, 71, 62, -64, 8, 8, -1, -1, -10, -10, -1, 62, -28, -10, 62 }, + { 8, 80, -64, 512, 8, -64, -1, -10, -10, 62, 62, -28, 8, -64, -1, -10, 8, -1, -1, 8, 71, 62, 80, -10 }, + { 8, -64, 80, 8, 512, -64, -1, 8, 71, 62, 80, -10, -10, -1, 62, -28, -10, 62, -64, 8, 8, -1, -1, -10 }, + { 80, 8, 8, -64, -64, 512, -10, -1, 62, -28, -10, 62, -1, 8, 71, 62, 80, -10, 8, -64, -1, -10, 8, -1 }, + { -64, 8, 8, -1, -1, -10, 512, -64, -64, 8, 8, 80, 80, -10, 8, -1, 62, 71, -10, 62, -1, -10, -28, 62 }, + { 8, -64, -1, -10, 8, -1, -64, 512, 8, 80, -64, 8, -10, 62, -1, -10, -28, 62, 80, -10, 8, -1, 62, 71 }, + { 8, -1, 80, -10, 71, 62, -64, 8, 512, -64, 80, 8, 8, -1, -64, 8, -10, -1, 62, -28, -10, -1, 62, -10 }, + { -1, -10, -10, 62, 62, -28, 8, 80, -64, 512, 8, -64, -1, -10, 8, -64, -1, 8, 71, 62, -1, 8, -10, 80 }, + { -1, 8, 71, 62, 80, -10, 8, -64, 80, 8, 512, -64, 62, -28, -10, -1, 62, -10, 8, -1, -64, 8, -10, -1 }, + { -10, -1, 62, -28, -10, 62, 80, 8, 8, -64, -64, 512, 71, 62, -1, 8, -10, 80, -1, -10, 8, -64, -1, 8 }, + { 8, -1, -64, 8, -10, 
-1, 80, -10, 8, -1, 62, 71, 512, -64, -64, 8, 8, 80, 62, -10, -28, 62, -1, -10 }, + { -1, -10, 8, -64, -1, 8, -10, 62, -1, -10, -28, 62, -64, 512, 8, 80, -64, 8, -10, 80, 62, 71, 8, -1 }, + { 80, -10, 8, -1, 62, 71, 8, -1, -64, 8, -10, -1, -64, 8, 512, -64, 80, 8, -28, 62, 62, -10, -10, -1 }, + { -10, 62, -1, -10, -28, 62, -1, -10, 8, -64, -1, 8, 8, 80, -64, 512, 8, -64, 62, 71, -10, 80, -1, 8 }, + { 71, 62, -1, 8, -10, 80, 62, -28, -10, -1, 62, -10, 8, -64, 80, 8, 512, -64, -1, 8, -10, -1, -64, 8 }, + { 62, -28, -10, -1, 62, -10, 71, 62, -1, 8, -10, 80, 80, 8, 8, -64, -64, 512, -10, -1, -1, 8, 8, -64 }, + { -1, 8, -10, -1, -64, 8, -10, 80, 62, 71, 8, -1, 62, -10, -28, 62, -1, -10, 512, -64, -64, 8, 8, 80 }, + { -10, -1, -1, 8, 8, -64, 62, -10, -28, 62, -1, -10, -10, 80, 62, 71, 8, -1, -64, 512, 8, 80, -64, 8 }, + { -10, 80, 62, 71, 8, -1, -1, 8, -10, -1, -64, 8, -28, 62, 62, -10, -10, -1, -64, 8, 512, -64, 80, 8 }, + { 62, -10, -28, 62, -1, -10, -10, -1, -1, 8, 8, -64, 62, 71, -10, 80, -1, 8, 8, 80, -64, 512, 8, -64 }, + { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, + { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ 
use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer 
allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffer for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same<fptype, fptype2>::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; //
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for all good helicities + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/configs.inc index b50d3d5335..570419b5c0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/configs.inc +++ 
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/configs.inc @@ -1530,3 +1530,5 @@ C Diagram 105 DATA (SPROP(I,-4,105),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/105/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/driver.f index f7f23196eb..5997e65826 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f index 39ecff768a..ec8440191f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -71,10 +71,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -275,17 +272,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -355,7 +341,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -398,7 +384,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(155) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -441,407 +428,81 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /9.481481481481481D+00, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01,1.481481481481481D - $ +00/ - DATA (CF(I, 1),I= 7, 12) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 1),I= 13, 18) /1.481481481481481D-01, - $ -1.851851851851852D-02,1.481481481481481D+00, - $ -1.851851851851852D-01,1.314814814814815D+00,1.148148148148148D - $ +00/ - DATA (CF(I, 1),I= 19, 24) /-1.851851851851852D-02, - $ -1.851851851851852D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -5.185185185185185D-01/ + DATA DENOM/54/ + DATA (CF(I),I= 1, 24) /512,-128,-128,16,16,160,-128,16,16,-2,-2 + $ ,-20,16,-2,160,-20,142,124,-2,-20,-20,124,124,-56/ C 1 T(1,2,5,6,3,4) - DATA (CF(I, 2),I= 1, 6) /-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01,1.481481481481481D - $ +00,-1.185185185185185D+00,1.481481481481481D-01/ - DATA (CF(I, 2),I= 7, 12) /1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 2),I= 13, 18) /-1.851851851851852D-02, - $ -1.851851851851852D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -5.185185185185185D-01/ - DATA (CF(I, 2),I= 19, 24) /1.481481481481481D-01, - $ -1.851851851851852D-02,1.481481481481481D+00, - $ -1.851851851851852D-01,1.314814814814815D+00,1.148148148148148D - $ +00/ + DATA (CF(I),I= 25, 47) /512,16,160,-128,16,16,-128,-2,-20,16,-2, + $ -2,-20,-20,124,124,-56,16,-2,160,-20,142,124/ C 1 T(1,2,6,5,3,4) - DATA (CF(I, 3),I= 1, 6) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,9.481481481481481D+00, - $ 
-1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01/ - DATA (CF(I, 3),I= 7, 12) /1.481481481481481D-01, - $ -1.851851851851852D-02,1.481481481481481D+00, - $ -1.851851851851852D-01,1.314814814814815D+00,1.148148148148148D - $ +00/ - DATA (CF(I, 3),I= 13, 18) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 3),I= 19, 24) /-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00/ + DATA (CF(I),I= 48, 69) /512,-128,160,16,16,-2,160,-20,142,124, + $ -128,16,16,-2,-2,-20,-20,-2,124,-56,-20,124/ C 1 T(1,5,2,6,3,4) - DATA (CF(I, 4),I= 1, 6) /1.481481481481481D-01 - $ ,1.481481481481481D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 4),I= 7, 12) /-1.851851851851852D-02, - $ -1.851851851851852D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -5.185185185185185D-01/ - DATA (CF(I, 4),I= 13, 18) /1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 4),I= 19, 24) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.314814814814815D+00,1.148148148148148D - $ +00,1.481481481481481D+00,-1.851851851851852D-01/ + DATA (CF(I),I= 70, 90) /512,16,-128,-2,-20,-20,124,124,-56,16, + $ -128,-2,-20,16,-2,-2,16,142,124,160,-20/ C 1 T(1,5,6,2,3,4) - DATA (CF(I, 5),I= 1, 6) /1.481481481481481D-01, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01,9.481481481481481D+00,-1.185185185185185D+00/ - DATA (CF(I, 5),I= 7, 12) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.314814814814815D+00,1.148148148148148D - $ +00,1.481481481481481D+00,-1.851851851851852D-01/ - DATA (CF(I, 5),I= 13, 18) /-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 5),I= 19, 24) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02, - $ -1.851851851851852D-01/ + DATA (CF(I),I= 91,110) /512,-128,-2,16,142,124,160,-20,-20,-2 + $ ,124,-56,-20,124,-128,16,16,-2,-2,-20/ C 1 T(1,6,2,5,3,4) - DATA (CF(I, 6),I= 1, 6) /1.481481481481481D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00/ - DATA (CF(I, 6),I= 7, 12) /-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 6),I= 13, 18) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.314814814814815D+00,1.148148148148148D - $ +00,1.481481481481481D+00,-1.851851851851852D-01/ - DATA (CF(I, 6),I= 19, 24) /1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02/ + DATA (CF(I),I=111,129) /512,-20,-2,124,-56,-20,124,-2,16,142,124 + $ ,160,-20,16,-128,-2,-20,16,-2/ C 1 T(1,6,5,2,3,4) - DATA (CF(I, 7),I= 1, 6) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 7),I= 7, 12) /9.481481481481481D+00, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ 
,1.481481481481481D-01,1.481481481481481D-01,1.481481481481481D - $ +00/ - DATA (CF(I, 7),I= 13, 18) /1.481481481481481D+00, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,1.148148148148148D+00,1.314814814814815D - $ +00/ - DATA (CF(I, 7),I= 19, 24) /-1.851851851851852D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00/ + DATA (CF(I),I=130,147) /512,-128,-128,16,16,160,160,-20,16,-2 + $ ,124,142,-20,124,-2,-20,-56,124/ C 1 T(2,1,5,6,3,4) - DATA (CF(I, 8),I= 1, 6) /1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 8),I= 7, 12) /-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01,1.481481481481481D - $ +00,-1.185185185185185D+00,1.481481481481481D-01/ - DATA (CF(I, 8),I= 13, 18) /-1.851851851851852D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 8),I= 19, 24) /1.481481481481481D+00, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,1.148148148148148D+00,1.314814814814815D - $ +00/ + DATA (CF(I),I=148,164) /512,16,160,-128,16,-20,124,-2,-20,-56 + $ ,124,160,-20,16,-2,124,142/ C 1 T(2,1,6,5,3,4) - DATA (CF(I, 9),I= 1, 6) /1.481481481481481D-01, - $ -1.851851851851852D-02,1.481481481481481D+00, - $ -1.851851851851852D-01,1.314814814814815D+00,1.148148148148148D - $ +00/ - DATA (CF(I, 9),I= 7, 12) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,9.481481481481481D+00, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01/ - DATA (CF(I, 9),I= 13, 18) /1.481481481481481D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 9),I= 19, 24) /1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -1.851851851851852D-01/ + DATA (CF(I),I=165,180) /512,-128,160,16,16,-2,-128,16,-20,-2,124 + $ ,-56,-20,-2,124,-20/ C 1 T(2,5,1,6,3,4) - DATA (CF(I, 10),I= 1, 6) /-1.851851851851852D-02, - $ -1.851851851851852D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -5.185185185185185D-01/ - DATA (CF(I, 10),I= 7, 12) /1.481481481481481D-01 - $ ,1.481481481481481D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 10),I= 13, 18) /-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 10),I= 19, 24) /1.314814814814815D+00 - $ ,1.148148148148148D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01 - $ ,1.481481481481481D+00/ + DATA (CF(I),I=181,195) /512,16,-128,-2,-20,16,-128,-2,16,142,124 + $ ,-2,16,-20,160/ C 1 T(2,5,6,1,3,4) - DATA (CF(I, 11),I= 1, 6) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.314814814814815D+00,1.148148148148148D - $ +00,1.481481481481481D+00,-1.851851851851852D-01/ - DATA (CF(I, 11),I= 7, 12) /1.481481481481481D-01, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01,9.481481481481481D+00,-1.185185185185185D+00/ - DATA (CF(I, 11),I= 13, 18) /1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ 
-1.851851851851852D-01/ - DATA (CF(I, 11),I= 19, 24) /1.481481481481481D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ + DATA (CF(I),I=196,209) /512,-128,124,-56,-20,-2,124,-20,16,-2, + $ -128,16,-20,-2/ C 1 T(2,6,1,5,3,4) - DATA (CF(I, 12),I= 1, 6) /-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 12),I= 7, 12) /1.481481481481481D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00/ - DATA (CF(I, 12),I= 13, 18) /1.314814814814815D+00 - $ ,1.148148148148148D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01 - $ ,1.481481481481481D+00/ - DATA (CF(I, 12),I= 19, 24) /-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ + DATA (CF(I),I=210,222) /512,142,124,-2,16,-20,160,-2,-20,16,-128 + $ ,-2,16/ C 1 T(2,6,5,1,3,4) - DATA (CF(I, 13),I= 1, 6) /1.481481481481481D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 13),I= 7, 12) /1.481481481481481D+00, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,1.148148148148148D+00,1.314814814814815D - $ +00/ - DATA (CF(I, 13),I= 13, 18) /9.481481481481481D+00, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01,1.481481481481481D - $ +00/ - DATA (CF(I, 13),I= 19, 24) /1.148148148148148D+00, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01/ + DATA (CF(I),I=223,234) /512,-128,-128,16,16,160,124,-20,-56,124, + $ -2,-20/ C 1 T(5,1,2,6,3,4) - DATA (CF(I, 14),I= 1, 6) /-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 14),I= 7, 12) /-1.851851851851852D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 14),I= 13, 18) /-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01,1.481481481481481D - $ +00,-1.185185185185185D+00,1.481481481481481D-01/ - DATA (CF(I, 14),I= 19, 24) /-1.851851851851852D-01 - $ ,1.481481481481481D+00,1.148148148148148D+00,1.314814814814815D - $ +00,1.481481481481481D-01,-1.851851851851852D-02/ + DATA (CF(I),I=235,245) /512,16,160,-128,16,-20,160,124,142,16,-2/ C 1 T(5,1,6,2,3,4) - DATA (CF(I, 15),I= 1, 6) /1.481481481481481D+00, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,1.148148148148148D+00,1.314814814814815D - $ +00/ - DATA (CF(I, 15),I= 7, 12) /1.481481481481481D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 15),I= 13, 18) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,9.481481481481481D+00, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01/ - DATA (CF(I, 15),I= 19, 24) /-5.185185185185185D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -1.851851851851852D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ + DATA (CF(I),I=246,255) /512,-128,160,16,-56,124,124,-20,-20,-2/ C 1 T(5,2,1,6,3,4) - 
DATA (CF(I, 16),I= 1, 6) /-1.851851851851852D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 16),I= 7, 12) /-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 16),I= 13, 18) /1.481481481481481D-01 - $ ,1.481481481481481D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 16),I= 19, 24) /1.148148148148148D+00 - $ ,1.314814814814815D+00,-1.851851851851852D-01 - $ ,1.481481481481481D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ + DATA (CF(I),I=256,264) /512,16,-128,124,142,-20,160,-2,16/ C 1 T(5,2,6,1,3,4) - DATA (CF(I, 17),I= 1, 6) /1.314814814814815D+00 - $ ,1.148148148148148D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01 - $ ,1.481481481481481D+00/ - DATA (CF(I, 17),I= 7, 12) /1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -1.851851851851852D-01/ - DATA (CF(I, 17),I= 13, 18) /1.481481481481481D-01, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01,9.481481481481481D+00,-1.185185185185185D+00/ - DATA (CF(I, 17),I= 19, 24) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01/ + DATA (CF(I),I=265,272) /512,-128,-2,16,-20,-2,-128,16/ C 1 T(5,6,1,2,3,4) - DATA (CF(I, 18),I= 1, 6) /1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -1.851851851851852D-01/ - DATA (CF(I, 18),I= 7, 12) /1.314814814814815D+00 - $ ,1.148148148148148D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01 - $ ,1.481481481481481D+00/ - DATA (CF(I, 18),I= 13, 18) /1.481481481481481D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00/ - DATA (CF(I, 18),I= 19, 24) /-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00/ + DATA (CF(I),I=273,279) /512,-20,-2,-2,16,16,-128/ C 1 T(5,6,2,1,3,4) - DATA (CF(I, 19),I= 1, 6) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01/ - DATA (CF(I, 19),I= 7, 12) /-1.851851851851852D-01 - $ ,1.481481481481481D+00,1.148148148148148D+00,1.314814814814815D - $ +00,1.481481481481481D-01,-1.851851851851852D-02/ - DATA (CF(I, 19),I= 13, 18) /1.148148148148148D+00, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 19),I= 19, 24) /9.481481481481481D+00, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01,1.481481481481481D - $ +00/ + DATA (CF(I),I=280,285) /512,-128,-128,16,16,160/ C 1 T(6,1,2,5,3,4) - DATA (CF(I, 20),I= 1, 6) /-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 20),I= 7, 12) /1.148148148148148D+00, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 20),I= 13, 
18) /-1.851851851851852D-01 - $ ,1.481481481481481D+00,1.148148148148148D+00,1.314814814814815D - $ +00,1.481481481481481D-01,-1.851851851851852D-02/ - DATA (CF(I, 20),I= 19, 24) /-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01,1.481481481481481D - $ +00,-1.185185185185185D+00,1.481481481481481D-01/ + DATA (CF(I),I=286,290) /512,16,160,-128,16/ C 1 T(6,1,5,2,3,4) - DATA (CF(I, 21),I= 1, 6) /-1.851851851851852D-01 - $ ,1.481481481481481D+00,1.148148148148148D+00,1.314814814814815D - $ +00,1.481481481481481D-01,-1.851851851851852D-02/ - DATA (CF(I, 21),I= 7, 12) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01/ - DATA (CF(I, 21),I= 13, 18) /-5.185185185185185D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -1.851851851851852D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 21),I= 19, 24) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,9.481481481481481D+00, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01/ + DATA (CF(I),I=291,294) /512,-128,160,16/ C 1 T(6,2,1,5,3,4) - DATA (CF(I, 22),I= 1, 6) /1.148148148148148D+00, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 22),I= 7, 12) /-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 22),I= 13, 18) /1.148148148148148D+00 - $ ,1.314814814814815D+00,-1.851851851851852D-01 - $ ,1.481481481481481D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 22),I= 19, 24) /1.481481481481481D-01 - $ ,1.481481481481481D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01, - $ -1.185185185185185D+00/ + DATA (CF(I),I=295,297) /512,16,-128/ C 1 T(6,2,5,1,3,4) - DATA (CF(I, 23),I= 1, 6) /1.148148148148148D+00 - $ ,1.314814814814815D+00,-1.851851851851852D-01 - $ ,1.481481481481481D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 23),I= 7, 12) /-5.185185185185185D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -1.851851851851852D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 23),I= 13, 18) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01/ - DATA (CF(I, 23),I= 19, 24) /1.481481481481481D-01, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01,9.481481481481481D+00,-1.185185185185185D+00/ + DATA (CF(I),I=298,299) /512,-128/ C 1 T(6,5,1,2,3,4) - DATA (CF(I, 24),I= 1, 6) /-5.185185185185185D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -1.851851851851852D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 24),I= 7, 12) /1.148148148148148D+00 - $ ,1.314814814814815D+00,-1.851851851851852D-01 - $ ,1.481481481481481D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 24),I= 13, 18) /-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 24),I= 19, 24) /1.481481481481481D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00/ + DATA (CF(I),I=300,300) /512/ C 1 T(6,5,2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN 
FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -1547,10 +1208,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -1559,6 +1222,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(2)=AMP2(2)+AMP(4)*DCONJG(AMP(4)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc index 70d0f7cb8e..c8b71f5ba4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,9 +101,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -109,10 +112,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 12; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,43 +171,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities -#endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId 
= 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) +#endif + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using 
E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -219,7 +275,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -228,14 +283,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
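The DeviceAccessJamp2 accessor introduced above uses a structure-of-arrays layout, buffer[icol * nevt + ievt], so that for a fixed colour index consecutive GPU threads (consecutive ievt) touch contiguous addresses. A minimal standalone sketch of that access pattern follows; the kernel name fillJamp2SoA and the plain double type are illustrative only and are not part of the generated sources.

#include <cuda_runtime.h>

// Illustrative only: zero-initialise a jamp2-like buffer laid out as [icol][ievt].
__global__ void fillJamp2SoA( double* buffer, int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;                 // total #events == total #threads
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;  // event handled by this thread
  for( int icol = 0; icol < ncolor; icol++ )
    buffer[icol * nevt + ievt] = 0.; // stride-1 in ievt for fixed icol: coalesced accesses
}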
@@ -261,14 +319,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -292,7 +346,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -306,7 +359,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -317,6 +369,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -868,164 +924,43 @@ namespace mg5amcCpu jamp_sv[9] += 1. / 2. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_gg_ttxuux()?) 
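The jamp2 running sums accumulated just above (cxabs2 of each colour flow, summed over helicities) are used to drive the event-by-event random choice of colour; the GPU version of that choice is the select_col kernel further below. A hedged host-side sketch of the selection logic, stripped of the icolamp/channelId filtering that select_col applies, is given here; the name chooseColor and the use of std::vector are illustrative only.

#include <vector>

// Illustrative only: pick a colour in [1,ncolor] with probability proportional to jamp2[icol],
// by comparing one random number in [0,1) against the normalised cumulative sums.
int chooseColor( const std::vector<double>& jamp2, double rnd )
{
  const int ncolor = (int)jamp2.size();
  std::vector<double> target( ncolor, 0. );
  for( int icol = 0; icol < ncolor; icol++ )
    target[icol] = ( icol == 0 ? 0. : target[icol - 1] ) + jamp2[icol]; // cumulative sum
  for( int icol = 0; icol < ncolor; icol++ )
    if( rnd < target[icol] / target[ncolor - 1] ) return icol + 1; // Fortran-style [1,ncolor]
  return ncolor; // numerical safety net (should not be reached for rnd < 1)
}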
- - // The color denominators (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] - - // The color matrix (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 48, 16, 16, 6, 0, 16, -2, 0, -6, -2, -2, 6 }, - { 16, 48, 6, 16, 16, 0, 0, -2, -2, -6, 6, -2 }, - { 16, 6, 48, 16, -2, 0, 0, 16, -2, 6, -6, -2 }, - { 6, 16, 16, 48, 0, -2, 16, 0, 6, -2, -2, -6 }, - { 0, 16, -2, 0, 48, 16, 16, 6, 0, -2, 16, 0 }, - { 16, 0, 0, -2, 16, 48, 6, 16, -2, 0, 0, 16 }, - { -2, 0, 0, 16, 16, 6, 48, 16, 16, 0, 0, -2 }, - { 0, -2, 16, 0, 6, 16, 16, 48, 0, 16, -2, 0 }, - { -6, -2, -2, 6, 0, -2, 16, 0, 48, 16, 16, 6 }, - { -2, -6, 6, -2, -2, 0, 0, 16, 16, 48, 6, 16 }, - { -2, 6, -6, -2, 16, 0, 0, -2, 16, 6, 48, 16 }, - { 6, -2, -2, -6, 0, 16, -2, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
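The removed comments above use two properties of the colour matrix: it is symmetric, so only the upper triangle is needed with off-diagonal terms counted twice, and it is real, so the quadratic form conj(J) C J reduces to Re(J) C Re(J) + Im(J) C Im(J). The Fortran hunk earlier in this patch exploits the same symmetry by flattening CF into an upper-triangular array walked with DO J = I, NCOLOR and a running CF_INDEX. A scalar reference sketch, purely illustrative (colorSumScalar is not a function of the plugin), assuming a common denominator as in this process:

#include <complex>
#include <vector>

// Illustrative only: ME = sum_ij conj(J_i) * C_ij/d * J_j for a real symmetric C
// and a common denominator d, visiting only the upper triangle of C.
double colorSumScalar( const std::vector<std::complex<double>>& jamp,
                       const std::vector<std::vector<double>>& cf,
                       double denom )
{
  const int ncolor = (int)jamp.size();
  double me = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    // Diagonal term
    double ztempR = cf[icol][icol] * jamp[icol].real();
    double ztempI = cf[icol][icol] * jamp[icol].imag();
    // Off-diagonal terms: count each (icol,jcol) pair twice (C is symmetric)
    for( int jcol = icol + 1; jcol < ncolor; jcol++ )
    {
      ztempR += 2 * cf[icol][jcol] * jamp[jcol].real();
      ztempI += 2 * cf[icol][jcol] * jamp[jcol].imag();
    }
    me += ( jamp[icol].real() * ztempR + jamp[icol].imag() * ztempI ) / denom;
  }
  return me;
}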
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -1113,7 +1048,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -1148,6 +1087,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -1190,6 +1133,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -1310,8 +1257,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -1319,25 +1266,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of 
helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1482,13 +1607,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1500,18 +1619,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1536,93 +1660,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1664,7 +1725,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1687,7 +1748,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1696,21 +1757,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1724,8 +1787,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1741,11 +1806,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1847,14 +1913,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h index 84a8066974..d142f229d3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,6 +79,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 36; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 12; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -125,7 +127,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -133,9 +135,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -155,34 +159,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running 
sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f index 49cac7230f..daa43b594b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f index 6e1c3f774f..747f5861c7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,14 +140,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -231,7 +231,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -305,6 +305,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -388,12 +392,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -475,51 +479,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 
0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/color_sum.cc new file mode 100644 index 0000000000..767405ac3b --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/color_sum.cc @@ -0,0 +1,437 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] + + // The color matrix (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 48, 16, 16, 6, 0, 16, -2, 0, -6, -2, -2, 6 }, + { 16, 48, 6, 16, 16, 0, 0, -2, -2, -6, 6, -2 }, + { 16, 6, 48, 16, -2, 0, 0, 16, -2, 6, -6, -2 }, + { 6, 16, 16, 48, 0, -2, 16, 0, 6, -2, -2, -6 }, + { 0, 16, -2, 0, 48, 16, 16, 6, 0, -2, 16, 0 }, + { 16, 0, 0, -2, 16, 48, 6, 16, -2, 0, 0, 16 }, + { -2, 0, 0, 16, 16, 6, 48, 16, 16, 0, 0, -2 }, + { 0, -2, 16, 0, 6, 16, 16, 48, 0, 16, -2, 0 }, + { -6, -2, -2, 6, 0, -2, 16, 0, 48, 16, 16, 6 }, + { -2, -6, 6, -2, -2, 0, 0, 16, 16, 48, 6, 16 }, + { -2, 6, -6, -2, 16, 0, 0, -2, 16, 6, 48, 16 }, + { 6, -2, -2, -6, 0, 16, -2, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void 
createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
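[Editor's note] The comment above relies on two algebraic facts: for a real color matrix M the quadratic form Re(J^dagger M J) reduces to A.M.A + B.M.B with A = Re(J), B = Im(J), and for a symmetric M with equal denominators (all 3 for this process) the sum can be folded onto the upper triangle with the off-diagonal entries doubled. The standalone sketch below is not part of the patch; the 3x3 matrix, equal denominators and plain scalar types are illustrative stand-ins for the plugin's ncolor=12 SIMD code. It only checks that the two forms agree.

// Standalone illustrative sketch (assumed toy values, not generated code)
#include <cassert>
#include <cmath>
#include <complex>
#include <cstdio>
int main()
{
  constexpr int ncolor = 3;                          // toy size (the real process uses 12)
  const double denom[ncolor] = { 3, 3, 3 };          // equal color denominators (required for the folding)
  const double cf[ncolor][ncolor] = { { 48, 16, 0 }, // real symmetric color matrix
                                      { 16, 48, 16 },
                                      { 0, 16, 48 } };
  const std::complex<double> jamp[ncolor] = { { 1.1, -0.3 }, { 0.2, 0.7 }, { -0.5, 0.4 } };
  // Reference: full square loop over the complex quadratic form
  double meFull = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = 0; j < ncolor; j++ ) ztemp += cf[i][j] * jamp[j];
    meFull += ( std::conj( ztemp ) * jamp[i] ).real() / denom[i];
  }
  // Triangular variant: diagonal once, off-diagonal terms counted twice (M is symmetric)
  double meTri = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    const double ai = jamp[i].real(), bi = jamp[i].imag();
    double ztempR = cf[i][i] / denom[i] * ai;
    double ztempI = cf[i][i] / denom[i] * bi;
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztempR += 2 * cf[i][j] / denom[i] * jamp[j].real();
      ztempI += 2 * cf[i][j] / denom[i] * jamp[j].imag();
    }
    meTri += ai * ztempR + bi * ztempI; // only the real parts survive, as in the comment above
  }
  printf( "full=%f triangular=%f\n", meFull, meTri );
  assert( std::abs( meFull - meTri ) < 1e-12 );
  return 0;
}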
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/configs.inc index d6f8bae63a..0fcb4cf404 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/configs.inc +++ 
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/configs.inc @@ -510,3 +510,5 @@ C Diagram 35 DATA (SPROP(I,-4,35),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/35/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/driver.f index f7f23196eb..5997e65826 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f index 9fb8f4d180..80e3731885 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -74,10 +74,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -278,17 +275,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -358,7 +344,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -404,7 +390,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(17) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -447,111 +434,44 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /1.600000000000000D+01 - $ ,5.333333333333333D+00,5.333333333333333D+00,2.000000000000000D - $ +00,0.000000000000000D+00,5.333333333333333D+00/ - DATA (CF(I, 1),I= 7, 12) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,-2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,2.000000000000000D+00/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 12) /48,32,32,12,0,32,-4,0,-12,-4,-4,12/ C 1 T(1,2,3,4) T(5,6) - DATA (CF(I, 2),I= 1, 6) /5.333333333333333D+00 - $ ,1.600000000000000D+01,2.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,0.000000000000000D+00/ - DATA (CF(I, 2),I= 7, 12) /0.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01, - $ -2.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01/ + DATA (CF(I),I= 13, 23) /48,12,32,32,0,0,-4,-4,-12,12,-4/ C 1 T(1,2,3,6) T(5,4) - DATA (CF(I, 3),I= 1, 6) /5.333333333333333D+00 - $ ,2.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00,-6.666666666666666D-01,0.000000000000000D+00/ - DATA (CF(I, 3),I= 7, 12) /0.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,2.000000000000000D+00,-2.000000000000000D+00, - $ -6.666666666666666D-01/ + DATA (CF(I),I= 24, 33) /48,32,-4,0,0,32,-4,12,-12,-4/ C 1 T(1,2,5,4) T(3,6) - DATA (CF(I, 4),I= 1, 6) /2.000000000000000D+00 - $ ,5.333333333333333D+00,5.333333333333333D+00,1.600000000000000D - $ +01,0.000000000000000D+00,-6.666666666666666D-01/ - DATA (CF(I, 4),I= 7, 12) /5.333333333333333D+00 - $ ,0.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01, - $ -2.000000000000000D+00/ + DATA (CF(I),I= 34, 42) /48,0,-4,32,0,12,-4,-4,-12/ C 1 T(1,2,5,6) T(3,4) - DATA (CF(I, 5),I= 1, 6) /0.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ 
,0.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00/ - DATA (CF(I, 5),I= 7, 12) /5.333333333333333D+00 - $ ,2.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00/ + DATA (CF(I),I= 43, 50) /48,32,32,12,0,-4,32,0/ C 1 T(1,3,4) T(2,5,6) - DATA (CF(I, 6),I= 1, 6) /5.333333333333333D+00 - $ ,0.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,1.600000000000000D - $ +01/ - DATA (CF(I, 6),I= 7, 12) /2.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00/ + DATA (CF(I),I= 51, 57) /48,12,32,-4,0,0,32/ C 1 T(1,3,6) T(2,5,4) - DATA (CF(I, 7),I= 1, 6) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,2.000000000000000D+00/ - DATA (CF(I, 7),I= 7, 12) /1.600000000000000D+01 - $ ,5.333333333333333D+00,5.333333333333333D+00,0.000000000000000D - $ +00,0.000000000000000D+00,-6.666666666666666D-01/ + DATA (CF(I),I= 58, 63) /48,32,32,0,0,-4/ C 1 T(1,5,4) T(2,3,6) - DATA (CF(I, 8),I= 1, 6) /0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00,2.000000000000000D+00,5.333333333333333D+00/ - DATA (CF(I, 8),I= 7, 12) /5.333333333333333D+00 - $ ,1.600000000000000D+01,0.000000000000000D+00,5.333333333333333D - $ +00,-6.666666666666666D-01,0.000000000000000D+00/ + DATA (CF(I),I= 64, 68) /48,0,32,-4,0/ C 1 T(1,5,6) T(2,3,4) - DATA (CF(I, 9),I= 1, 6) /-2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,2.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01/ - DATA (CF(I, 9),I= 7, 12) /5.333333333333333D+00 - $ ,0.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00,5.333333333333333D+00,2.000000000000000D+00/ + DATA (CF(I),I= 69, 72) /48,32,32,12/ C 1 T(2,1,3,4) T(5,6) - DATA (CF(I, 10),I= 1, 6) /-6.666666666666666D-01, - $ -2.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,0.000000000000000D+00/ - DATA (CF(I, 10),I= 7, 12) /0.000000000000000D+00 - $ ,5.333333333333333D+00,5.333333333333333D+00,1.600000000000000D - $ +01,2.000000000000000D+00,5.333333333333333D+00/ + DATA (CF(I),I= 73, 75) /48,12,32/ C 1 T(2,1,3,6) T(5,4) - DATA (CF(I, 11),I= 1, 6) /-6.666666666666666D-01 - $ ,2.000000000000000D+00,-2.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00/ - DATA (CF(I, 11),I= 7, 12) /0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,2.000000000000000D - $ +00,1.600000000000000D+01,5.333333333333333D+00/ + DATA (CF(I),I= 76, 77) /48,32/ C 1 T(2,1,5,4) T(3,6) - DATA (CF(I, 12),I= 1, 6) /2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01, - $ -2.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00/ - DATA (CF(I, 12),I= 7, 12) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,2.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,1.600000000000000D+01/ + DATA (CF(I),I= 78, 78) /48/ C 1 T(2,1,5,6) T(3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. 
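[Editor's note] The rewritten MATRIX1 replaces the dense REAL*8 CF(NCOLOR,NCOLOR) above with a packed integer array holding only the upper triangle, pre-scaled by the common DENOM and with off-diagonal entries already doubled (e.g. 5.333... = 16/3 is stored as 32 = 2*16 with DENOM = 3), so the color loop further down in matrix1.f can run over J >= I and divide by DENOM once at the end. The following standalone C++ sketch of the same packing uses a made-up 4x4 symmetric matrix and illustrative names, not the generated code, and only checks that the packed and dense sums agree.

// Standalone illustrative sketch of the packed upper-triangular color matrix
#include <cassert>
#include <cmath>
#include <complex>
#include <vector>
int main()
{
  constexpr int n = 4;
  const int denom = 3;
  const int cf[n][n] = { { 48, 16, 0, 16 }, // symmetric integer matrix (= denom * real color factors)
                         { 16, 48, 16, 0 },
                         { 0, 16, 48, 16 },
                         { 16, 0, 16, 48 } };
  // Pack the upper triangle row by row; off-diagonal entries are stored doubled
  std::vector<int> cfPacked;
  for( int i = 0; i < n; i++ )
    for( int j = i; j < n; j++ )
      cfPacked.push_back( j == i ? cf[i][j] : 2 * cf[i][j] );
  assert( (int)cfPacked.size() == n * ( n + 1 ) / 2 );
  const std::complex<double> jamp[n] = { { 1., 2. }, { -1., 0.5 }, { 0.3, -0.2 }, { 2., 1. } };
  // Reference: dense loop, dividing each row by denom
  double meFull = 0;
  for( int i = 0; i < n; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = 0; j < n; j++ ) ztemp += (double)cf[i][j] * jamp[j];
    meFull += ( std::conj( ztemp ) * jamp[i] ).real() / denom;
  }
  // Packed variant mirroring the new Fortran loop: a running CF_INDEX over j >= i,
  // with a single division by DENOM after the double loop
  double mePacked = 0;
  int cfIndex = 0; // CF_INDEX
  for( int i = 0; i < n; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = i; j < n; j++ ) ztemp += (double)cfPacked[cfIndex++] * jamp[j];
    mePacked += ( std::conj( ztemp ) * jamp[i] ).real();
  }
  mePacked /= denom;
  assert( std::abs( meFull - mePacked ) < 1e-9 );
  return 0;
}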
- IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -760,10 +680,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -772,6 +694,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc index ac4bf091b7..a7827dbfab 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,9 +101,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -109,10 +112,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 12; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,43 +171,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities -#endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId 
= 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) +#endif + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using 
E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -219,7 +275,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -228,14 +283,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
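[Editor's note] For reference, the jamp super-buffer that calculate_jamps writes and the BLAS color sum reads is, per the striding comments above, one flat fptype array of size 2*ncolor*nGoodHel*nevt, with all real parts first, all imaginary parts second, and the index icol*nhel*nevt + ihel*nevt + ievt inside each half. The host-side sketch below uses a plain std::vector and made-up names; it is not the plugin's DeviceAccessJamp accessor, only an illustration of that indexing.

// Standalone illustrative sketch of the assumed jamp super-buffer striding
#include <cassert>
#include <complex>
#include <vector>
int main()
{
  const int ncolor = 12, nhel = 16, nevt = 32;
  std::vector<double> jamps( 2 * ncolor * nhel * nevt, 0. );
  double* jampsReal = jamps.data();                        // first half: real parts
  double* jampsImag = jamps.data() + ncolor * nhel * nevt; // second half: imaginary parts
  // Write one complex amplitude for a given (icol, ihel, ievt)
  auto write = [&]( int icol, int ihel, int ievt, std::complex<double> z )
  {
    const int idx = icol * nhel * nevt + ihel * nevt + ievt;
    jampsReal[idx] = z.real();
    jampsImag[idx] = z.imag();
  };
  write( 3, 5, 7, { 1.5, -2.5 } );
  // The same element seen as a flat [2][ncolor][nhel][nevt] array
  assert( jamps[0 * ncolor * nhel * nevt + 3 * nhel * nevt + 5 * nevt + 7] == 1.5 );
  assert( jamps[1 * ncolor * nhel * nevt + 3 * nhel * nevt + 5 * nevt + 7] == -2.5 );
  // A per-helicity sub-buffer (as passed to one stream) just offsets by ihel*nevt,
  // so the kernel can use ihel0 = 0 internally
  const double* hJampsReal = jampsReal + 5 * nevt;
  assert( hJampsReal[3 * nhel * nevt + 0 * nevt + 7] == 1.5 );
  return 0;
}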
@@ -261,14 +319,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -292,7 +346,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -306,7 +359,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -317,6 +369,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -868,164 +924,43 @@ namespace mg5amcCpu jamp_sv[10] -= 1. / 2. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_gu_ttxgu()?) 
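[Editor's note] The per-color running sums accumulated here (jamp2_sv on C++, colAllJamp2s via atomicAdd on GPU) feed the event-by-event color choice implemented further down in select_col: a cumulative sum is built over the colors allowed for the selected config and the first bin whose cumulative fraction exceeds the random number is picked. The standalone sketch below reproduces only that selection logic with made-up sizes and values; it is not the plugin's code.

// Standalone illustrative sketch of the jamp2-driven color selection
#include <cassert>
#include <complex>
int main()
{
  constexpr int ncolor = 4;
  const std::complex<double> jampHel0[ncolor] = { { 1., 0. }, { 0., 2. }, { 0.5, 0.5 }, { 0., 0. } };
  const std::complex<double> jampHel1[ncolor] = { { 0., 1. }, { 1., 1. }, { 0., 0. }, { 2., 0. } };
  const bool icolamp[ncolor] = { true, true, false, true }; // colors allowed for this config (made up)
  // Running sum of |jamp|^2 over helicities (what jamp2 / colAllJamp2s accumulate)
  double jamp2[ncolor] = { 0 };
  for( int icol = 0; icol < ncolor; icol++ )
    jamp2[icol] += std::norm( jampHel0[icol] ) + std::norm( jampHel1[icol] );
  // Cumulative target amplitudes over the allowed colors only
  double targetamp[ncolor];
  for( int icol = 0; icol < ncolor; icol++ )
  {
    targetamp[icol] = ( icol == 0 ? 0. : targetamp[icol - 1] );
    if( icolamp[icol] ) targetamp[icol] += jamp2[icol];
  }
  // Pick the first color whose cumulative fraction exceeds the random number
  const double rndcol = 0.7;
  int selcol = 0; // Fortran-style range [1,ncolor]
  for( int icol = 0; icol < ncolor; icol++ )
    if( rndcol < targetamp[icol] / targetamp[ncolor - 1] ) { selcol = icol + 1; break; }
  assert( selcol >= 1 && selcol <= ncolor );
  return 0;
}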
- - // The color denominators (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] - - // The color matrix (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 48, 16, 0, 16, -2, 0, 16, 6, 0, 16, 0, -2 }, - { 16, 48, 16, 0, 0, -2, 6, 16, 16, 0, -2, 0 }, - { 0, 16, 48, 16, 16, 6, -2, 0, 6, -2, -6, -2 }, - { 16, 0, 16, 48, 6, 16, 0, -2, -2, 6, -2, -6 }, - { -2, 0, 16, 6, 48, 16, 0, 16, -2, -6, -2, 6 }, - { 0, -2, 6, 16, 16, 48, 16, 0, -6, -2, 6, -2 }, - { 16, 6, -2, 0, 0, 16, 48, 16, -2, 0, 16, 0 }, - { 6, 16, 0, -2, 16, 0, 16, 48, 0, -2, 0, 16 }, - { 0, 16, 6, -2, -2, -6, -2, 0, 48, 16, 6, 16 }, - { 16, 0, -2, 6, -6, -2, 0, -2, 16, 48, 16, 6 }, - { 0, -2, -6, -2, -2, 6, 16, 0, 6, 16, 48, 16 }, - { -2, 0, -2, -6, 6, -2, 0, 16, 16, 6, 16, 48 } }; // 2-D array[12][12] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
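[Editor's note] The mixed-precision branch deleted just below (and re-implemented in color_sum_cpu in the new color_sum.cc) packs two double event pages into one float SIMD vector, runs the color algebra in float, and splits the result back into two double halves. The sketch below is a scalar stand-in using plain arrays; fpvmerge/fpvsplit0/fpvsplit1 are plugin SIMD helpers whose exact lane ordering may differ, and only the merge/compute/split pattern is illustrated.

// Standalone illustrative stand-in for the mixed-precision merge/split pattern
#include <array>
#include <cassert>
constexpr int neppV = 4;
using dvec = std::array<double, neppV>;
using fvec = std::array<float, 2 * neppV>;
fvec fpvmerge( const dvec& lo, const dvec& hi ) // pack two double pages into one float vector
{
  fvec out{};
  for( int i = 0; i < neppV; i++ ) { out[i] = (float)lo[i]; out[neppV + i] = (float)hi[i]; }
  return out;
}
dvec fpvsplit0( const fvec& v ) // first (even-page) half, promoted back to double
{
  dvec out{};
  for( int i = 0; i < neppV; i++ ) out[i] = v[i];
  return out;
}
dvec fpvsplit1( const fvec& v ) // second (odd-page) half, promoted back to double
{
  dvec out{};
  for( int i = 0; i < neppV; i++ ) out[i] = v[neppV + i];
  return out;
}
int main()
{
  const dvec pageEven = { 1.0, 2.0, 3.0, 4.0 };
  const dvec pageOdd = { 5.0, 6.0, 7.0, 8.0 };
  fvec merged = fpvmerge( pageEven, pageOdd );
  for( auto& x : merged ) x *= 2.0f; // stand-in for the float color algebra
  const dvec outEven = fpvsplit0( merged );
  const dvec outOdd = fpvsplit1( merged );
  assert( outEven[2] == 6.0 && outOdd[3] == 16.0 );
  return 0;
}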
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -1113,7 +1048,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -1148,6 +1087,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -1190,6 +1133,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -1310,8 +1257,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -1319,25 +1266,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of 
helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1482,13 +1607,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1500,18 +1619,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1536,93 +1660,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1664,7 +1725,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1687,7 +1748,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1696,21 +1757,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1724,8 +1787,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1741,11 +1806,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1847,14 +1913,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h index f75309f403..35718f5b21 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,6 +79,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 36; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 12; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -125,7 +127,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -133,9 +135,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -155,34 +159,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running 
sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f index 47e378e255..e363f036a8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f index 756e98881c..7481a1ea65 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,14 +140,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF D2=PDG2PDF(LPP(IB(2)),1, IB(2),XBK(IB(2)), QSCALE) U2=PDG2PDF(LPP(IB(2)),2, IB(2),XBK(IB(2)), QSCALE) @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -309,6 +309,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -393,18 +397,18 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) D2(IVEC)=PDG2PDF(LPP(IB(2)),1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) U2(IVEC)=PDG2PDF(LPP(IB(2)),2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) S2(IVEC)=PDG2PDF(LPP(IB(2)),3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) C2(IVEC)=PDG2PDF(LPP(IB(2)),4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -486,51 +490,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/color_sum.cc new file mode 100644 index 0000000000..db09ae848e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/color_sum.cc @@ -0,0 +1,437 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
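// A minimal standalone sketch (toy values, hypothetical names; not part of color_sum.cc or of
// the generated code) of the operation that the new color_sum.cc implements: for one helicity,
// |M|^2 is the quadratic form sum_i sum_j conj(J_i) * CF_ij * J_j / denom_i over the QCD
// partial amplitudes J ("jamps"), where the color matrix CF is real and symmetric. The plugin
// evaluates this same form via a CUDA/HIP kernel, via cuBLAS/hipBLAS GEMMs, or via the SIMD C++
// path; the plain loop below only shows the underlying arithmetic.
#include <complex>
#include <cstdio>

int main()
{
  constexpr int ncolor = 2;                                     // toy size; P2_gu_ttxgu uses ncolor = 12
  const double denom[ncolor] = { 3, 3 };                        // per-row color denominators
  const double cf[ncolor][ncolor] = { { 48, 16 }, { 16, 48 } }; // toy real symmetric color matrix
  const std::complex<double> jamp[ncolor] = { { 1.0, 0.5 }, { -0.25, 2.0 } };
  double me2 = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0;                             // ztemp_i = sum_j CF_ij * J_j
    for( int j = 0; j < ncolor; j++ ) ztemp += cf[i][j] * jamp[j];
    me2 += ( std::conj( jamp[i] ) * ztemp ).real() / denom[i];  // add Re( conj(J_i) * ztemp_i ) / denom_i
  }
  std::printf( "|M|^2 (color-summed, one helicity) = %f\n", me2 );
  return 0;
}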
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] + + // The color matrix (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 48, 16, 0, 16, -2, 0, 16, 6, 0, 16, 0, -2 }, + { 16, 48, 16, 0, 0, -2, 6, 16, 16, 0, -2, 0 }, + { 0, 16, 48, 16, 16, 6, -2, 0, 6, -2, -6, -2 }, + { 16, 0, 16, 48, 6, 16, 0, -2, -2, 6, -2, -6 }, + { -2, 0, 16, 6, 48, 16, 0, 16, -2, -6, -2, 6 }, + { 0, -2, 6, 16, 16, 48, 16, 0, -6, -2, 6, -2 }, + { 16, 6, -2, 0, 0, 16, 48, 16, -2, 0, 16, 0 }, + { 6, 16, 0, -2, 16, 0, 16, 48, 0, -2, 0, 16 }, + { 0, 16, 6, -2, -2, -6, -2, 0, 48, 16, 6, 16 }, + { 16, 0, -2, 6, -6, -2, 0, -2, 16, 48, 16, 6 }, + { 0, -2, -6, -2, -2, 6, 16, 0, 6, 16, 48, 16 }, + { -2, 0, -2, -6, 6, -2, 0, 16, 16, 6, 16, 48 } }; // 2-D array[12][12] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = 
TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: 
number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/configs.inc index b2af8a7144..6b1cf30883 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/configs.inc +++ 
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/configs.inc @@ -510,3 +510,5 @@ C Diagram 35 DATA (SPROP(I,-4,35),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/35/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/driver.f index f7f23196eb..5997e65826 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f index 0079f40417..099c6ca7c5 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -74,10 +74,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -278,17 +275,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -358,7 +344,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -404,7 +390,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(17) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -447,111 +434,44 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /1.600000000000000D+01 - $ ,5.333333333333333D+00,0.000000000000000D+00,5.333333333333333D - $ +00,-6.666666666666666D-01,0.000000000000000D+00/ - DATA (CF(I, 1),I= 7, 12) /5.333333333333333D+00 - $ ,2.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00,0.000000000000000D+00,-6.666666666666666D-01/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 12) /48,32,0,32,-4,0,32,12,0,32,0,-4/ C 1 T(1,3,2) T(5,6,4) - DATA (CF(I, 2),I= 1, 6) /5.333333333333333D+00 - $ ,1.600000000000000D+01,5.333333333333333D+00,0.000000000000000D - $ +00,0.000000000000000D+00,-6.666666666666666D-01/ - DATA (CF(I, 2),I= 7, 12) /2.000000000000000D+00 - $ ,5.333333333333333D+00,5.333333333333333D+00,0.000000000000000D - $ +00,-6.666666666666666D-01,0.000000000000000D+00/ + DATA (CF(I),I= 13, 23) /48,32,0,0,-4,12,32,32,0,-4,0/ C 1 T(1,3,4) T(5,6,2) - DATA (CF(I, 3),I= 1, 6) /0.000000000000000D+00 - $ ,5.333333333333333D+00,1.600000000000000D+01,5.333333333333333D - $ +00,5.333333333333333D+00,2.000000000000000D+00/ - DATA (CF(I, 3),I= 7, 12) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01,-2.000000000000000D+00, - $ -6.666666666666666D-01/ + DATA (CF(I),I= 24, 33) /48,32,32,12,-4,0,12,-4,-12,-4/ C 1 T(1,5,3,2) T(6,4) - DATA (CF(I, 4),I= 1, 6) /5.333333333333333D+00 - $ ,0.000000000000000D+00,5.333333333333333D+00,1.600000000000000D - $ +01,2.000000000000000D+00,5.333333333333333D+00/ - DATA (CF(I, 4),I= 7, 12) /0.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,2.000000000000000D+00,-6.666666666666666D-01, - $ -2.000000000000000D+00/ + DATA (CF(I),I= 34, 42) /48,12,32,0,-4,-4,12,-4,-12/ C 1 T(1,5,3,4) T(6,2) - DATA (CF(I, 5),I= 1, 6) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,5.333333333333333D+00,2.000000000000000D - $ 
+00,1.600000000000000D+01,5.333333333333333D+00/ - DATA (CF(I, 5),I= 7, 12) /0.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01, - $ -2.000000000000000D+00,-6.666666666666666D-01 - $ ,2.000000000000000D+00/ + DATA (CF(I),I= 43, 50) /48,32,0,32,-4,-12,-4,12/ C 1 T(1,5,6,2) T(3,4) - DATA (CF(I, 6),I= 1, 6) /0.000000000000000D+00, - $ -6.666666666666666D-01,2.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,1.600000000000000D+01/ - DATA (CF(I, 6),I= 7, 12) /5.333333333333333D+00 - $ ,0.000000000000000D+00,-2.000000000000000D+00, - $ -6.666666666666666D-01,2.000000000000000D+00, - $ -6.666666666666666D-01/ + DATA (CF(I),I= 51, 57) /48,32,0,-12,-4,12,-4/ C 1 T(1,5,6,4) T(3,2) - DATA (CF(I, 7),I= 1, 6) /5.333333333333333D+00 - $ ,2.000000000000000D+00,-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00/ - DATA (CF(I, 7),I= 7, 12) /1.600000000000000D+01 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,0.000000000000000D+00,5.333333333333333D+00,0.000000000000000D - $ +00/ + DATA (CF(I),I= 58, 63) /48,32,-4,0,32,0/ C 1 T(1,6,2) T(5,3,4) - DATA (CF(I, 8),I= 1, 6) /2.000000000000000D+00 - $ ,5.333333333333333D+00,0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00/ - DATA (CF(I, 8),I= 7, 12) /5.333333333333333D+00 - $ ,1.600000000000000D+01,0.000000000000000D+00, - $ -6.666666666666666D-01,0.000000000000000D+00,5.333333333333333D - $ +00/ + DATA (CF(I),I= 64, 68) /48,0,-4,0,32/ C 1 T(1,6,4) T(5,3,2) - DATA (CF(I, 9),I= 1, 6) /0.000000000000000D+00 - $ ,5.333333333333333D+00,2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01, - $ -2.000000000000000D+00/ - DATA (CF(I, 9),I= 7, 12) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00,2.000000000000000D+00,5.333333333333333D+00/ + DATA (CF(I),I= 69, 72) /48,32,12,32/ C 1 T(3,2) T(5,1,6,4) - DATA (CF(I, 10),I= 1, 6) /5.333333333333333D+00 - $ ,0.000000000000000D+00,-6.666666666666666D-01 - $ ,2.000000000000000D+00,-2.000000000000000D+00, - $ -6.666666666666666D-01/ - DATA (CF(I, 10),I= 7, 12) /0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,1.600000000000000D - $ +01,5.333333333333333D+00,2.000000000000000D+00/ + DATA (CF(I),I= 73, 75) /48,32,12/ C 1 T(3,4) T(5,1,6,2) - DATA (CF(I, 11),I= 1, 6) /0.000000000000000D+00, - $ -6.666666666666666D-01,-2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,2.000000000000000D+00/ - DATA (CF(I, 11),I= 7, 12) /5.333333333333333D+00 - $ ,0.000000000000000D+00,2.000000000000000D+00,5.333333333333333D - $ +00,1.600000000000000D+01,5.333333333333333D+00/ + DATA (CF(I),I= 76, 77) /48,32/ C 1 T(5,1,3,2) T(6,4) - DATA (CF(I, 12),I= 1, 6) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,-6.666666666666666D-01, - $ -2.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01/ - DATA (CF(I, 12),I= 7, 12) /0.000000000000000D+00 - $ ,5.333333333333333D+00,5.333333333333333D+00,2.000000000000000D - $ +00,5.333333333333333D+00,1.600000000000000D+01/ + DATA (CF(I),I= 78, 78) /48/ C 1 T(5,1,3,4) T(6,2) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. 
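// A minimal standalone sketch (toy data, hypothetical names; not the generated Fortran) of the
// packed upper-triangular color matrix that the rewritten MATRIX1 color sum uses: only entries
// with J >= I are stored, off-diagonal entries are pre-doubled, and a single integer DENOM
// divides the final result, replacing the old full NCOLOR x NCOLOR double-precision CF array.
// The assert verifies that the packed loop reproduces the full symmetric quadratic form.
#include <cassert>
#include <cmath>
#include <complex>
#include <vector>

int main()
{
  constexpr int ncolor = 3;                                // toy size; this subprocess uses ncolor = 12
  const double denom = 3;                                  // common denominator (DENOM in matrix1.f)
  const double cf[ncolor][ncolor] = { { 48, 16, 0 }, { 16, 48, 16 }, { 0, 16, 48 } };
  std::vector<double> cfPacked;                            // row-major packed upper triangle, j >= i
  for( int i = 0; i < ncolor; i++ )
    for( int j = i; j < ncolor; j++ )
      cfPacked.push_back( ( i == j ? 1 : 2 ) * cf[i][j] ); // off-diagonal entries stored pre-doubled
  const std::complex<double> jamp[ncolor] = { { 1, 2 }, { -3, 0.5 }, { 0.25, -1 } };
  double ref = 0;                                          // reference: full ncolor x ncolor sum
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      ref += ( std::conj( jamp[i] ) * cf[i][j] * jamp[j] ).real() / denom;
  double packed = 0;                                       // packed: loop only over j >= i (the CF_INDEX walk)
  int idx = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = i; j < ncolor; j++ ) ztemp += cfPacked[idx++] * jamp[j];
    packed += ( ztemp * std::conj( jamp[i] ) ).real();
  }
  packed /= denom;
  assert( std::abs( ref - packed ) < 1e-9 );               // both formulations give the same |M|^2
  return 0;
}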
-        IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO
-     $ *SMALL_WIDTH_TREATMENT)), ZERO)
-        IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT
-     $ *SMALL_WIDTH_TREATMENT)), MDL_WT)
+        FK_ZERO = 0D0
+        IF(MDL_WT.NE.0D0) THEN
+          FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT
+     $ *SMALL_WIDTH_TREATMENT)), MDL_WT)
+        ELSE
+          FK_MDL_WT = 0D0
+        ENDIF
+
         IF(INIT_MODE) THEN
           ZEROAMP_1(:,:) = .TRUE.
@@ -761,10 +681,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
       MATRIX1 = 0.D0
       DO M = 1, NAMPSO
+        CF_INDEX = 0
         DO I = 1, NCOLOR
           ZTEMP = (0.D0,0.D0)
-          DO J = 1, NCOLOR
-            ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M)
+          DO J = I, NCOLOR
+            CF_INDEX = CF_INDEX + 1
+            ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M)
           ENDDO
           DO N = 1, NAMPSO
@@ -773,6 +695,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
           ENDDO
         ENDDO
       ENDDO
+      MATRIX1 = MATRIX1/DENOM
      IF(SDE_STRAT.EQ.1)THEN
        AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1))
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc
index acf1b836af..9e03e92989 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc
@@ -7,7 +7,7 @@
 // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
@@ -16,6 +16,7 @@
 #include "mgOnGpuConfig.h"
+#include "GpuRuntime.h"
 #include "HelAmps_sm.h"
 #include "MemoryAccessAmplitudes.h"
 #include "MemoryAccessChannelIds.h"
@@ -25,6 +26,7 @@
 #include "MemoryAccessMatrixElements.h"
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
+#include "color_sum.h"
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -99,9 +101,10 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
-  constexpr int nw6 = CPPProcess::nw6;     // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
-  constexpr int npar = CPPProcess::npar;   // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
-  constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
+  constexpr int nw6 = CPPProcess::nw6;       // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
+  constexpr int npar = CPPProcess::npar;     // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
+  constexpr int ncomb = CPPProcess::ncomb;   // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
+  constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors
   // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)]
   //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g.
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -109,10 +112,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 12; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,43 +171,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities -#endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId 
= 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) +#endif + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using 
E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -219,7 +275,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -228,14 +283,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
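The DeviceAccessJamp2 helper added earlier in this hunk addresses the per-color squared amplitudes as buffer[icol * nevt + ievt], i.e. a structure-of-arrays layout in which, for a fixed color, consecutive GPU threads (consecutive ievt) touch consecutive memory locations. A short CUDA sketch of a consumer of that layout, with a hypothetical kernel name and a local stand-in for the plugin's fptype:

#include <cuda_runtime.h>

using fptype = double; // stand-in for the plugin's fptype in a double-precision build (assumption)

// Hypothetical kernel (illustration only): sum the squared partial amplitudes over
// colors for each event. For a fixed icol the access allJamp2s[icol * nevt + ievt] is
// stride-1 across a warp, so every iteration of the color loop is a coalesced load.
__global__ void sumJamp2OverColors( const fptype* allJamp2s, // input: [ncolor * nevt], SoA layout
                                    fptype* out,             // output: [nevt]
                                    const int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  fptype sum = 0;
  for( int icol = 0; icol < ncolor; icol++ )
    sum += allJamp2s[icol * nevt + ievt]; // coalesced: adjacent threads read adjacent addresses
  out[ievt] = sum;
}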
@@ -261,14 +319,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -292,7 +346,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -306,7 +359,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -317,6 +369,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -868,164 +924,43 @@ namespace mg5amcCpu jamp_sv[8] += 1. / 2. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_gux_ttxgux()?) 
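The jamp2 running sums filled just above (|jamp(icol)|^2 accumulated over the good helicities, via plain additions on the C++ path and atomicAdd across helicity streams on the GPU path shown below) exist only to drive the event-by-event choice of a color flow: later in this patch (select_col on the GPU and the corresponding C++ block in sigmaKin) they are turned into cumulative weights and sampled with one random number per event. A minimal host-side sketch of that sampling step, with hypothetical names and without the icolamp/iconfig masking that the real code applies first:

// Illustration only: pick a color flow in the Fortran range [1,ncolor] from the
// accumulated squared partial amplitudes of one event, as select_col does.
// 'jamp2' and 'rndcol' are hypothetical per-event inputs (rndcol is uniform in [0,1)).
int chooseColor( const double* jamp2, const int ncolor, const double rndcol )
{
  double total = 0;
  for( int icol = 0; icol < ncolor; icol++ ) total += jamp2[icol];
  double cumulative = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    cumulative += jamp2[icol];
    if( rndcol < cumulative / total ) return icol + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
  }
  return ncolor; // guard against rounding when rndcol is very close to 1
}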
- - // The color denominators (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] - - // The color matrix (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 48, 16, 16, 6, 0, 16, -2, 0, 0, 16, -2, 0 }, - { 16, 48, 6, 16, 16, 0, 0, -2, 16, 0, 0, -2 }, - { 16, 6, 48, 16, -2, 0, 0, 16, -2, 0, 0, 16 }, - { 6, 16, 16, 48, 0, -2, 16, 0, 0, -2, 16, 0 }, - { 0, 16, -2, 0, 48, 16, 16, 6, 6, -2, -2, -6 }, - { 16, 0, 0, -2, 16, 48, 6, 16, -2, 6, -6, -2 }, - { -2, 0, 0, 16, 16, 6, 48, 16, -2, -6, 6, -2 }, - { 0, -2, 16, 0, 6, 16, 16, 48, -6, -2, -2, 6 }, - { 0, 16, -2, 0, 6, -2, -2, -6, 48, 16, 16, 6 }, - { 16, 0, 0, -2, -2, 6, -6, -2, 16, 48, 6, 16 }, - { -2, 0, 0, 16, -2, -6, 6, -2, 16, 6, 48, 16 }, - { 0, -2, 16, 0, -6, -2, -2, 6, 6, 16, 16, 48 } }; // 2-D array[12][12] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
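Both the block removed above and its replacement in the new color_sum.cc rely on the same algebraic fact quoted in the comments: the color matrix is real and symmetric, so for jamp = A + iB (A, B real vectors) the quadratic form jamp^H M jamp equals A^T M A + B^T M B and is purely real, which is why the code can process the real and imaginary parts separately (and, on the C++ side, fold the factor 2 and the 1/denom into a precomputed triangular matrix). A tiny self-contained check of that identity, with made-up 2x2 numbers:

#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  // Toy real symmetric matrix and complex vector (illustration only)
  constexpr int n = 2;
  const double M[n][n] = { { 48, 16 }, { 16, 48 } };
  const std::complex<double> jamp[n] = { { 1.0, -0.5 }, { 0.25, 2.0 } };
  std::complex<double> lhs = 0; // jamp^H M jamp
  double rhs = 0;               // A^T M A + B^T M B
  for( int i = 0; i < n; i++ )
    for( int j = 0; j < n; j++ )
    {
      lhs += std::conj( jamp[i] ) * M[i][j] * jamp[j];
      rhs += jamp[i].real() * M[i][j] * jamp[j].real() + jamp[i].imag() * M[i][j] * jamp[j].imag();
    }
  assert( std::abs( lhs.imag() ) < 1e-9 );       // the quadratic form is real for symmetric M
  assert( std::abs( lhs.real() - rhs ) < 1e-9 ); // ... and equals AMA + BMB
  return 0;
}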
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -1113,7 +1048,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -1148,6 +1087,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -1190,6 +1133,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -1310,8 +1257,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -1319,25 +1266,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of 
helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1482,13 +1607,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1500,18 +1619,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1536,93 +1660,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1664,7 +1725,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1687,7 +1748,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1696,21 +1757,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1724,8 +1787,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1741,11 +1806,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1847,14 +1913,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h index 531d6bcd03..c1177b083f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,6 +79,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 36; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 12; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -125,7 +127,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -133,9 +135,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -155,34 +159,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running 
sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f index f13f023e7d..3b0621c453 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f index a59705bfaf..0a0d60ea62 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,14 +140,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -309,6 +309,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -393,18 +397,18 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -486,51 +490,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/color_sum.cc new file mode 100644 index 0000000000..13c347c712 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/color_sum.cc @@ -0,0 +1,437 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
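The new color_sum.cc translation unit that starts here (only partially visible in this excerpt) hosts both the C++ color sum (color_sum_cpu) and the GPU path that sigmaKin drives through the ghelAllJamps/ghelAllBlasTmp buffers, the per-helicity streams and the cuBLAS/hipBLAS handle. The full BLAS implementation is not shown in this hunk, so the sketch below is only one plausible way to phrase the batched color sum as a GEMM plus a small reduction kernel; all buffer and function names (d_jampRe, d_tmp, dotRows, ...) and the separate real/imaginary layout are assumptions, double precision is assumed, and the real code goes through the gpuBlas*/gpuLaunchKernel abstraction macros rather than raw cuBLAS calls:

#include <cublas_v2.h>
#include <cuda_runtime.h>

// Hypothetical helper: per-event dot product of two [ncolor * nevt] SoA buffers,
// accumulated into the per-event matrix elements.
__global__ void dotRows( const double* J, const double* T, double* MEs, const int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  double sum = 0;
  for( int icol = 0; icol < ncolor; icol++ )
    sum += J[icol * nevt + ievt] * T[icol * nevt + ievt];
  MEs[ievt] += sum;
}

// Sketch: ME(e) += sum_ij Re_i(e) Ctilde_ij Re_j(e) + Im_i(e) Ctilde_ij Im_j(e) for all events
// of one helicity. With jamps stored as buffer[icol * nevt + ievt], each real part is a
// column-major (nevt x ncolor) matrix, so T = J * Ctilde is a single GEMM (Ctilde is
// symmetric, hence no transpose is needed), followed by a row-wise dot product.
void colorSumViaBlasSketch( cublasHandle_t handle,
                            const double* d_jampRe, const double* d_jampIm, // [ncolor * nevt] each
                            const double* d_colorMat,                       // [ncolor * ncolor] normalized color matrix
                            double* d_tmp,                                  // [ncolor * nevt] workspace
                            double* d_MEs,                                  // [nevt] running sum over helicities
                            const int ncolor, const int gpublocks, const int gputhreads )
{
  const int nevt = gpublocks * gputhreads;
  const double one = 1, zero = 0;
  cublasDgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, nevt, ncolor, ncolor,
               &one, d_jampRe, nevt, d_colorMat, ncolor, &zero, d_tmp, nevt );
  dotRows<<<gpublocks, gputhreads>>>( d_jampRe, d_tmp, d_MEs, ncolor );
  cublasDgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, nevt, ncolor, ncolor,
               &one, d_jampIm, nevt, d_colorMat, ncolor, &zero, d_tmp, nevt );
  dotRows<<<gpublocks, gputhreads>>>( d_jampIm, d_tmp, d_MEs, ncolor );
}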
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] + + // The color matrix (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 48, 16, 16, 6, 0, 16, -2, 0, 0, 16, -2, 0 }, + { 16, 48, 6, 16, 16, 0, 0, -2, 16, 0, 0, -2 }, + { 16, 6, 48, 16, -2, 0, 0, 16, -2, 0, 0, 16 }, + { 6, 16, 16, 48, 0, -2, 16, 0, 0, -2, 16, 0 }, + { 0, 16, -2, 0, 48, 16, 16, 6, 6, -2, -2, -6 }, + { 16, 0, 0, -2, 16, 48, 6, 16, -2, 6, -6, -2 }, + { -2, 0, 0, 16, 16, 6, 48, 16, -2, -6, 6, -2 }, + { 0, -2, 16, 0, 6, 16, 16, 48, -6, -2, -2, 6 }, + { 0, 16, -2, 0, 6, -2, -2, -6, 48, 16, 16, 6 }, + { 16, 0, 0, -2, -2, 6, -6, -2, 16, 48, 6, 16 }, + { -2, 0, 0, 16, -2, -6, 6, -2, 16, 6, 48, 16 }, + { 0, -2, 16, 0, -6, -2, -2, 6, 6, 16, 16, 48 } }; // 2-D array[12][12] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = 
TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: 
number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/configs.inc index e6e67b9933..bdaa2e8a30 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/configs.inc +++ 
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/configs.inc @@ -510,3 +510,5 @@ C Diagram 35 DATA (SPROP(I,-4,35),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/35/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/driver.f index f7f23196eb..5997e65826 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f index 7cd8b962cc..980fe65932 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -74,10 +74,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -278,17 +275,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -358,7 +344,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -404,7 +390,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(17) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -447,109 +434,44 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /1.600000000000000D+01 - $ ,5.333333333333333D+00,5.333333333333333D+00,2.000000000000000D - $ +00,0.000000000000000D+00,5.333333333333333D+00/ - DATA (CF(I, 1),I= 7, 12) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00,-6.666666666666666D-01,0.000000000000000D+00/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 12) /48,32,32,12,0,32,-4,0,0,32,-4,0/ C 1 T(1,2,4) T(5,3,6) - DATA (CF(I, 2),I= 1, 6) /5.333333333333333D+00 - $ ,1.600000000000000D+01,2.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,0.000000000000000D+00/ - DATA (CF(I, 2),I= 7, 12) /0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00,0.000000000000000D+00,-6.666666666666666D-01/ + DATA (CF(I),I= 13, 23) /48,12,32,32,0,0,-4,32,0,0,-4/ C 1 T(1,2,6) T(5,3,4) - DATA (CF(I, 3),I= 1, 6) /5.333333333333333D+00 - $ ,2.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00,-6.666666666666666D-01,0.000000000000000D+00/ - DATA (CF(I, 3),I= 7, 12) /0.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00/ + DATA (CF(I),I= 24, 33) /48,32,-4,0,0,32,-4,0,0,32/ C 1 T(1,3,4) T(5,2,6) - DATA (CF(I, 4),I= 1, 6) /2.000000000000000D+00 - $ ,5.333333333333333D+00,5.333333333333333D+00,1.600000000000000D - $ +01,0.000000000000000D+00,-6.666666666666666D-01/ - DATA (CF(I, 4),I= 7, 12) /5.333333333333333D+00 - $ ,0.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00/ + DATA (CF(I),I= 34, 42) /48,0,-4,32,0,0,-4,32,0/ C 1 T(1,3,6) T(5,2,4) - DATA (CF(I, 5),I= 1, 6) /0.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ 
,0.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00/ - DATA (CF(I, 5),I= 7, 12) /5.333333333333333D+00 - $ ,2.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01, - $ -2.000000000000000D+00/ + DATA (CF(I),I= 43, 50) /48,32,32,12,12,-4,-4,-12/ C 1 T(1,5,2,4) T(3,6) - DATA (CF(I, 6),I= 1, 6) /5.333333333333333D+00 - $ ,0.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,1.600000000000000D - $ +01/ - DATA (CF(I, 6),I= 7, 12) /2.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,2.000000000000000D+00,-2.000000000000000D+00, - $ -6.666666666666666D-01/ + DATA (CF(I),I= 51, 57) /48,12,32,-4,12,-12,-4/ C 1 T(1,5,2,6) T(3,4) - DATA (CF(I, 7),I= 1, 6) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,2.000000000000000D+00/ - DATA (CF(I, 7),I= 7, 12) /1.600000000000000D+01 - $ ,5.333333333333333D+00,-6.666666666666666D-01, - $ -2.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01/ + DATA (CF(I),I= 58, 63) /48,32,-4,-12,12,-4/ C 1 T(1,5,3,4) T(2,6) - DATA (CF(I, 8),I= 1, 6) /0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00,2.000000000000000D+00,5.333333333333333D+00/ - DATA (CF(I, 8),I= 7, 12) /5.333333333333333D+00 - $ ,1.600000000000000D+01,-2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,2.000000000000000D+00/ + DATA (CF(I),I= 64, 68) /48,-12,-4,-4,12/ C 1 T(1,5,3,6) T(2,4) - DATA (CF(I, 9),I= 1, 6) /0.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,0.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01/ - DATA (CF(I, 9),I= 7, 12) /-6.666666666666666D-01, - $ -2.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00,5.333333333333333D+00,2.000000000000000D+00/ + DATA (CF(I),I= 69, 72) /48,32,32,12/ C 1 T(2,4) T(5,1,3,6) - DATA (CF(I, 10),I= 1, 6) /5.333333333333333D+00 - $ ,0.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,2.000000000000000D+00/ - DATA (CF(I, 10),I= 7, 12) /-2.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,1.600000000000000D - $ +01,2.000000000000000D+00,5.333333333333333D+00/ + DATA (CF(I),I= 73, 75) /48,12,32/ C 1 T(2,6) T(5,1,3,4) - DATA (CF(I, 11),I= 1, 6) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00,-6.666666666666666D-01,-2.000000000000000D+00/ - DATA (CF(I, 11),I= 7, 12) /2.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,2.000000000000000D - $ +00,1.600000000000000D+01,5.333333333333333D+00/ + DATA (CF(I),I= 76, 77) /48,32/ C 1 T(3,4) T(5,1,2,6) - DATA (CF(I, 12),I= 1, 6) /0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00,-2.000000000000000D+00,-6.666666666666666D-01/ - DATA (CF(I, 12),I= 7, 12) /-6.666666666666666D-01 - $ ,2.000000000000000D+00,2.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,1.600000000000000D+01/ + DATA (CF(I),I= 78, 78) /48/ C 1 T(3,6) T(5,1,2,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. 
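In the matrix1.f hunk above, the color matrix is no longer stored as a full NCOLOR x NCOLOR array of doubles but as a packed upper triangle of integers (CF) with one common denominator (DENOM); the consuming loop a little further below walks the packed array with a running CF_INDEX and divides once by DENOM at the end. The C++ sketch below mirrors that packed-triangle bookkeeping with a toy 3-color case and real-valued amplitudes; the values are invented for illustration (the generated code uses 12 colors and complex JAMP).

#include <cstdio>

int main()
{
  constexpr int ncolor = 3;                                // toy size (the generated process uses 12 colors)
  const int cf[ncolor * ( ncolor + 1 ) / 2] = { 48, 16, 6, // row 0, columns 0..2
                                                48, 16,    // row 1, columns 1..2
                                                48 };      // row 2, column  2
  const int denom = 3;                                     // single common denominator, as DATA DENOM/3/
  const double jamp[ncolor] = { 1.0, 0.5, -0.25 };         // hypothetical (real-valued) color amplitudes
  double matrix = 0;
  int cfIndex = 0;                                         // running index over the packed triangle, as CF_INDEX
  for( int i = 0; i < ncolor; i++ )
  {
    double ztemp = 0;
    for( int j = i; j < ncolor; j++ )                      // Fortran: DO J = I, NCOLOR
      ztemp += cf[cfIndex++] * jamp[j];
    matrix += ztemp * jamp[i];
  }
  matrix /= denom;                                         // Fortran: MATRIX1 = MATRIX1/DENOM
  printf( "matrix=%f\n", matrix );
  return 0;
}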
- IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -758,10 +680,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -770,6 +694,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc index d34888db6a..b03275381b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,9 +103,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -111,10 +114,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -173,43 +173,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -221,7 +277,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -230,14 +285,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
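The color sum that calculate_jamps above now delegates to color_sum.cc (earlier in this diff) exploits two properties spelled out in its comments: the color matrix M is real, so the quadratic form conj(J)^T M J reduces to (Re J)^T M (Re J) + (Im J)^T M (Im J), and M is symmetric, so the double sum can be folded onto the upper triangle with the off-diagonal entries doubled. The short self-contained check below illustrates both identities numerically; the 3x3 matrix and jamp values are made up for the example.

#include <cassert>
#include <cmath>
#include <complex>
#include <cstdio>

int main()
{
  constexpr int n = 3;                                                              // toy size (the real processes use 6 or 12 colors)
  const double m[n][n] = { { 27, 9, 3 }, { 9, 27, 9 }, { 3, 9, 27 } };              // real symmetric toy "color matrix"
  const std::complex<double> jamp[n] = { { 1., 2. }, { -0.5, 0.25 }, { 3., -1. } }; // hypothetical color amplitudes
  // Full quadratic form conj(jamp)^T * M * jamp: its imaginary part cancels because M is real and symmetric
  std::complex<double> full = 0;
  for( int i = 0; i < n; i++ )
    for( int j = 0; j < n; j++ )
      full += std::conj( jamp[i] ) * m[i][j] * jamp[j];
  // Triangular form: diagonal once, off-diagonal entries doubled, real and imaginary parts summed separately
  double tri = 0;
  for( int i = 0; i < n; i++ )
  {
    double ztempR = m[i][i] * jamp[i].real();
    double ztempI = m[i][i] * jamp[i].imag();
    for( int j = i + 1; j < n; j++ )
    {
      ztempR += 2 * m[i][j] * jamp[j].real();
      ztempI += 2 * m[i][j] * jamp[j].imag();
    }
    tri += ztempR * jamp[i].real() + ztempI * jamp[i].imag();
  }
  assert( std::abs( full.imag() ) < 1e-12 );
  assert( std::abs( full.real() - tri ) < 1e-9 );
  printf( "full=%f triangular=%f\n", full.real(), tri );
  return 0;
}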
@@ -263,14 +321,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -294,7 +348,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -308,7 +361,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -319,6 +371,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -449,158 +505,43 @@ namespace mg5amcCpu jamp_sv[5] -= 1. / 12. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_uc_ttxuc()?) 
- - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 27, 9, 9, 3, 3, 9 }, - { 9, 27, 3, 9, 9, 3 }, - { 9, 3, 27, 9, 9, 3 }, - { 3, 9, 9, 27, 3, 9 }, - { 3, 9, 9, 3, 27, 9 }, - { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
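Both the removed code here and its replacement in color_sum.cc support a mixed floating-point mode (MGONGPU_FPTYPE_DOUBLE together with MGONGPU_FPTYPE2_FLOAT), in which the color algebra runs in single precision while the rest of the calculation stays in double precision. The scalar C++ sketch below only illustrates that precision trade-off with plain casts; the toy matrix and jamp values are invented, and the real code works on SIMD vectors via fpvmerge/fpvsplit rather than on scalars.

#include <cstdio>

int main()
{
  const double jampR[2] = { 1.000000123456789, -0.333333333333333 }; // hypothetical double-precision color amplitudes (real parts)
  const double cf[2][2] = { { 16., 2. }, { 2., 16. } };              // toy 2x2 color matrix
  // Reference: quadratic form evaluated entirely in double precision
  double refD = 0;
  for( int i = 0; i < 2; i++ )
    for( int j = 0; j < 2; j++ )
      refD += jampR[i] * cf[i][j] * jampR[j];
  // "Mixed" mode: downcast the amplitudes, do the color algebra in float, upcast the result
  const float jampRf[2] = { (float)jampR[0], (float)jampR[1] };
  float sumF = 0;
  for( int i = 0; i < 2; i++ )
    for( int j = 0; j < 2; j++ )
      sumF += jampRf[i] * (float)cf[i][j] * jampRf[j];
  const double mixed = (double)sumF;
  printf( "double=%.15f mixed=%.15f diff=%.2e\n", refD, mixed, refD - mixed );
  return 0;
}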
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -688,7 +629,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -723,6 +668,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -765,6 +714,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -885,8 +838,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -894,25 +847,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 
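The select_hel kernel above first accumulates, per event, the running sum of |M|^2 over the good helicities and then picks the first helicity whose cumulative fraction of the total exceeds the event's random number; the device loop that follows this comment is the actual implementation. A host-side C++ sketch of that cumulative-probability selection, with hypothetical per-helicity weights, is:

#include <cstdio>
#include <vector>

// Event-by-event selection by cumulative probability (hypothetical weights, 0-based helicity index)
int selectHelicity( const std::vector<double>& helMEs, double rnd )
{
  double total = 0;
  for( double me : helMEs ) total += me;
  double running = 0;
  for( std::size_t ighel = 0; ighel < helMEs.size(); ighel++ )
  {
    running += helMEs[ighel];
    if( rnd < running / total ) return static_cast<int>( ighel ); // first helicity whose cumulative fraction exceeds rnd
  }
  return static_cast<int>( helMEs.size() ) - 1; // guard against rounding when rnd is very close to 1
}

int main()
{
  const std::vector<double> helMEs = { 0.1, 0.4, 0.2, 0.3 }; // hypothetical |M|^2 contributions of the good helicities for one event
  printf( "rnd=0.05 -> ighel=%d\n", selectHelicity( helMEs, 0.05 ) ); // picks helicity 0
  printf( "rnd=0.60 -> ighel=%d\n", selectHelicity( helMEs, 0.60 ) ); // picks helicity 2
  return 0;
}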
+ //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1057,13 +1188,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1075,18 +1200,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1111,93 +1241,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1239,7 +1306,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1262,7 +1329,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1271,21 +1338,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1299,8 +1368,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1316,11 +1387,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1422,14 +1494,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h index 08510dfc85..0b88c815d2 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -80,6 +81,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 7; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -127,7 +129,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -135,9 +137,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -157,34 +161,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum 
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f index bb9d2c55fb..5eb74ead8d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f index b76b7c4456..71844a31af 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -142,7 +142,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) @@ -151,7 +151,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF D2=PDG2PDF(LPP(IB(2)),1, IB(2),XBK(IB(2)), QSCALE) S2=PDG2PDF(LPP(IB(2)),3, IB(2),XBK(IB(2)), QSCALE) @@ -243,7 +243,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -321,6 +321,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -406,20 +410,20 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) D2(IVEC)=PDG2PDF(LPP(IB(2)),1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) S2(IVEC)=PDG2PDF(LPP(IB(2)),3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) C2(IVEC)=PDG2PDF(LPP(IB(2)),4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -513,51 +517,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/color_sum.cc new file mode 100644 index 0000000000..a1e583992a --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/color_sum.cc @@ -0,0 +1,431 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
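// [Editorial aside: hedged reference sketch, not part of the generated file]
// This new file evaluates the color sum |M|^2 = sum_{i,j} jamp_i^* ( colorMatrix[i][j] / colorDenom[i] ) jamp_j
// for one helicity. Because the color matrix is real and symmetric, the quadratic form splits into
// two purely real ones, Re(J)^T M Re(J) + Im(J)^T M Im(J), which is the identity exploited by both the
// kernel and the CPU implementations below (and by the BLAS path, which factorizes the same computation
// into one GEMM, Ztemp = M * J, followed by batched per-event dot products). A minimal reference
// implementation of that identity, using plain std::complex and hypothetical names, is:

#include <complex>

// me2 = Re( J^dagger * M * J ) for a real matrix M, using only real products
// (cf[i*ncol+j] is assumed to hold colorMatrix[i][j]/colorDenom[i])
inline double
colorSumReference( const std::complex<double>* jamp, // input: jamp[ncol] for one event and one helicity
                   const double* cf,                 // input: normalized color matrix [ncol*ncol]
                   const int ncol )                  // input: number of leading colors
{
  double me2 = 0;
  for( int i = 0; i < ncol; i++ )
  {
    double ztR = 0, ztI = 0;
    for( int j = 0; j < ncol; j++ )
    {
      ztR += cf[i * ncol + j] * jamp[j].real();
      ztI += cf[i * ncol + j] * jamp[j].imag();
    }
    me2 += jamp[i].real() * ztR + jamp[i].imag() * ztI;
  }
  return me2;
}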
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 27, 9, 9, 3, 3, 9 }, + { 9, 27, 3, 9, 9, 3 }, + { 9, 3, 27, 9, 9, 3 }, + { 3, 9, 9, 27, 3, 9 }, + { 3, 9, 9, 3, 27, 9 }, + { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. 
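    // [Editorial aside: worked example of the symmetric-to-triangular rewrite used here]
    //   sum_{i,j} A_i M_ij A_j = sum_i M_ii A_i^2 + 2 * sum_{i<j} M_ij A_i A_j   (M real symmetric)
    // e.g. for ncolor=2 and A=(a1,a2):
    //   a1*M11*a1 + a1*M12*a2 + a2*M21*a1 + a2*M22*a2 = M11*a1^2 + M22*a2^2 + 2*M12*a1*a2  (since M21=M12)
    // which is why the off-diagonal entries of cf2 are pre-multiplied by 2 (and divided by
    // colorDenom) once at compile time in TriangularNormalizedColorMatrix.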
+ // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int 
icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/configs.inc index ddb1d6a390..eb5fc269e9 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/configs.inc +++ 
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/configs.inc @@ -105,3 +105,5 @@ C Diagram 7 DATA (SPROP(I,-4,7),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/7/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/driver.f index f7f23196eb..5997e65826 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f index bfe665d186..c79c6062f4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -76,10 +76,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -280,17 +277,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -360,7 +346,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -408,7 +394,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(8) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -451,39 +438,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /2.700000000000000D+01 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,3.000000000000000D+00,9.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 6) /27,18,18,6,6,18/ C 1 T(3,1) T(5,2) T(6,4) - DATA (CF(I, 2),I= 1, 6) /9.000000000000000D+00 - $ ,2.700000000000000D+01,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 7, 11) /27,6,18,18,6/ C 1 T(3,1) T(5,4) T(6,2) - DATA (CF(I, 3),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,2.700000000000000D+01,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 12, 15) /27,18,18,6/ C 1 T(3,2) T(5,1) T(6,4) - DATA (CF(I, 4),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,2.700000000000000D - $ +01,3.000000000000000D+00,9.000000000000000D+00/ + DATA (CF(I),I= 16, 18) /27,6,18/ C 1 T(3,2) T(5,4) T(6,1) - DATA (CF(I, 5),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,2.700000000000000D+01,9.000000000000000D+00/ + DATA (CF(I),I= 19, 20) /27,18/ C 1 T(3,4) T(5,1) T(6,2) - DATA (CF(I, 6),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,2.700000000000000D+01/ + DATA (CF(I),I= 21, 21) /27/ C 1 T(3,4) T(5,2) T(6,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. 
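C     [Editorial note: descriptive comment, not generated code] The color matrix above is now
C     stored as a packed upper triangle of length NCOLOR*(NCOLOR+1)/2 = 21 (for NCOLOR=6),
C     scanned with CF_INDEX in the DO J = I, NCOLOR loop further below. Off-diagonal entries
C     are stored pre-doubled (9 -> 18, 3 -> 6) so that the triangular loop reproduces the
C     full symmetric color sum, and the overall normalisation is applied once at the end
C     via MATRIX1 = MATRIX1/DENOM, mirroring the triangular color sum on the cudacpp side.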
- IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -553,10 +533,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -565,6 +547,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc index 66e4b80f71..aa721caff8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -107,9 +109,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -117,10 +120,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -179,43 +179,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -227,7 +283,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -236,14 +291,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
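
[Editor's illustrative aside, not part of the generated patch.] The accessor DeviceAccessJamp2 introduced above assumes a colour-major, event-innermost layout for the per-colour |jamp|^2 super-buffer, i.e. element (icol, ievt) sits at buffer[icol * nevt + ievt]. A minimal host-side C++ sketch of that indexing (toy sizes, hypothetical names) is:

#include <cassert>
#include <vector>

int main()
{
  const int ncolor = 6; // leading colours for this subprocess
  const int nevt = 4;   // a tiny toy event count
  std::vector<double> jamp2( ncolor * nevt, 0. );
  // Fill element (icol, ievt) with a recognisable value
  for( int icol = 0; icol < ncolor; icol++ )
    for( int ievt = 0; ievt < nevt; ievt++ )
      jamp2[icol * nevt + ievt] = 100. * icol + ievt;
  // All events of a given colour are contiguous, so consecutive GPU threads
  // (consecutive ievt) reading the same icol access coalesced memory
  assert( jamp2[2 * nevt + 3] == 203. );
  return 0;
}
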
@@ -269,14 +327,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -300,7 +354,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -314,7 +367,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -325,6 +377,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -455,158 +511,43 @@ namespace mg5amcCpu jamp_sv[5] -= 1. / 36. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_ucx_ttxucx()?) 
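
[Editor's illustrative aside, not part of the generated patch.] The block above accumulates per-colour |jamp(icol)|^2 over all good helicities, which later drives the event-by-event colour choice; as the in-code comment notes, the CUDA path must use atomicAdd because each good helicity now runs in its own stream and may update colAllJamp2s concurrently. A serial C++ sketch of the same accumulation (toy values, hypothetical names) is:

#include <complex>
#include <vector>

int main()
{
  const int ncolor = 6, ngoodhel = 3;
  // toy partial amplitudes jamp[ihel][icol]
  std::vector<std::vector<std::complex<double>>> jamp(
    ngoodhel, std::vector<std::complex<double>>( ncolor, { 0.1, -0.2 } ) );
  std::vector<double> jamp2( ncolor, 0. ); // running sum over helicities
  for( int ihel = 0; ihel < ngoodhel; ihel++ )
    for( int icol = 0; icol < ncolor; icol++ )
      jamp2[icol] += std::norm( jamp[ihel][icol] ); // |jamp|^2 = re^2 + im^2
  // On the GPU this "+=" becomes an atomicAdd, since helicity streams run in parallel
  return 0;
}
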
- - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 27, 9, 9, 3, 3, 9 }, - { 9, 27, 3, 9, 9, 3 }, - { 9, 3, 27, 9, 9, 3 }, - { 3, 9, 9, 27, 3, 9 }, - { 3, 9, 9, 3, 27, 9 }, - { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
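
[Editor's illustrative aside, not part of the generated patch.] The code removed above (and re-hosted in color_sum.cc) implements the #475 optimisation: since the colour matrix cf is real and symmetric and the per-row denominators are equal, the quadratic form sum_ij conj(J_i) (cf_ij/denom_i) J_j can be evaluated over the upper triangle only, with the factor 2 and the 1/denom folded in, using just the real and imaginary parts of the colour flows. A self-contained C++ sketch with a toy 3x3 matrix (hypothetical values) comparing the two evaluations:

#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  const int n = 3;
  const double denom[n] = { 1, 1, 1 }; // all equal, as in the generated processes
  const double cf[n][n] = { { 27, 9, 3 }, { 9, 27, 9 }, { 3, 9, 27 } }; // real symmetric
  const std::complex<double> jamp[n] = { { 0.3, -0.1 }, { -0.2, 0.4 }, { 0.1, 0.2 } };
  // (1) Reference: full double loop over the dense matrix
  double me1 = 0;
  for( int i = 0; i < n; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = 0; j < n; j++ ) ztemp += cf[i][j] * jamp[j];
    me1 += ( ztemp * std::conj( jamp[i] ) ).real() / denom[i];
  }
  // (2) Upper triangle only: diagonal once, off-diagonal doubled, real arithmetic
  double me2 = 0;
  for( int i = 0; i < n; i++ )
  {
    double ztR = cf[i][i] / denom[i] * jamp[i].real();
    double ztI = cf[i][i] / denom[i] * jamp[i].imag();
    for( int j = i + 1; j < n; j++ )
    {
      ztR += 2 * cf[i][j] / denom[i] * jamp[j].real();
      ztI += 2 * cf[i][j] / denom[i] * jamp[j].imag();
    }
    me2 += jamp[i].real() * ztR + jamp[i].imag() * ztI;
  }
  assert( std::abs( me1 - me2 ) < 1e-12 ); // the two forms agree
  return 0;
}
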
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -694,7 +635,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -729,6 +674,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -771,6 +720,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -891,8 +844,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -900,25 +853,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 
+ //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1063,13 +1194,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1081,18 +1206,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1117,93 +1247,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1245,7 +1312,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1268,7 +1335,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1277,21 +1344,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1305,8 +1374,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1322,11 +1393,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1428,14 +1500,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h index 04b9f5bcb1..fda9e102a7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -86,6 +87,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 7; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -133,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -141,9 +143,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -163,34 +167,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum 
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f index 5046df7e56..4f2b282d2a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f index 848991a32a..071faddb9b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -148,7 +148,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) @@ -158,7 +158,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -269,7 +269,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -353,6 +353,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -438,24 +442,24 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -585,51 +589,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/color_sum.cc new file mode 100644 index 0000000000..a1e583992a --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/color_sum.cc @@ -0,0 +1,431 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
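
[Editor's illustrative aside, not part of the generated patch.] The DSIG1_VEC change above replaces a flat loop over IVEC with a warp-blocked structure: the beam ordering IB(1)/IB(2) is set once per warp from IMIRROR_VEC, and the flat event index is recovered as IVEC = (CURR_WARP-1)*WARP_SIZE + IWARP. A schematic C++ rendering of that loop shape (hypothetical names, illustrative only) is:

#include <cstdio>
#include <vector>

int main()
{
  const int warp_size = 4, nb_warp_used = 2;
  std::vector<int> imirror_vec = { 1, 2 }; // one mirror flag per warp
  for( int curr_warp = 0; curr_warp < nb_warp_used; curr_warp++ )
  {
    int ib[2];
    if( imirror_vec[curr_warp] == 1 ) { ib[0] = 1; ib[1] = 2; } // no mirroring
    else { ib[0] = 2; ib[1] = 1; }                              // mirrored beams
    for( int iwarp = 0; iwarp < warp_size; iwarp++ )
    {
      const int ivec = curr_warp * warp_size + iwarp; // flat event index
      std::printf( "ivec=%d beams=(%d,%d)\n", ivec, ib[0], ib[1] );
    }
  }
  return 0;
}
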
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 27, 9, 9, 3, 3, 9 }, + { 9, 27, 3, 9, 9, 3 }, + { 9, 3, 27, 9, 9, 3 }, + { 3, 9, 9, 27, 3, 9 }, + { 3, 9, 9, 3, 27, 9 }, + { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. 
+ // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int 
icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/configs.inc index 6da72c9bac..1a0a5a720d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/configs.inc +++ 
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/configs.inc @@ -105,3 +105,5 @@ C Diagram 7 DATA (SPROP(I,-4,7),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/7/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/driver.f index f7f23196eb..5997e65826 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f index 5dcb5155f3..f6321517c1 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -82,10 +82,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -286,17 +283,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -366,7 +352,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -420,7 +406,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(8) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -463,39 +450,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /2.700000000000000D+01 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,3.000000000000000D+00,9.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 6) /27,18,18,6,6,18/ C 1 T(2,1) T(3,4) T(5,6) - DATA (CF(I, 2),I= 1, 6) /9.000000000000000D+00 - $ ,2.700000000000000D+01,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 7, 11) /27,6,18,18,6/ C 1 T(2,1) T(3,6) T(5,4) - DATA (CF(I, 3),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,2.700000000000000D+01,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 12, 15) /27,18,18,6/ C 1 T(2,4) T(3,1) T(5,6) - DATA (CF(I, 4),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,2.700000000000000D - $ +01,3.000000000000000D+00,9.000000000000000D+00/ + DATA (CF(I),I= 16, 18) /27,6,18/ C 1 T(2,4) T(3,6) T(5,1) - DATA (CF(I, 5),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,2.700000000000000D+01,9.000000000000000D+00/ + DATA (CF(I),I= 19, 20) /27,18/ C 1 T(2,6) T(3,1) T(5,4) - DATA (CF(I, 6),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,2.700000000000000D+01/ + DATA (CF(I),I= 21, 21) /27/ C 1 T(2,6) T(3,4) T(5,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. 
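The matrix1.f hunk above also switches the colour data from a REAL*8 NCOLOR x NCOLOR matrix to a packed INTEGER array CF(NCOLOR*(NCOLOR+1)/2) with a common DENOM: each row stores the diagonal entry followed by the doubled off-diagonal entries for J >= I, and the rewritten colour-sum loop further down in this hunk walks the array sequentially via CF_INDEX. The standalone C++ check below reproduces that packing from the original 6x6 colour matrix and compares it with the DATA values shown above; it is an illustration only, not generated code.

// Standalone check (not generated code): pack the 6x6 real colour matrix into
// the 21-element triangular integer array used by matrix1.f, doubling the
// off-diagonal entries, and compare with the DATA statements in the hunk above.
#include <cassert>

int main()
{
  constexpr int ncolor = 6;
  const int full[ncolor][ncolor] = { { 27, 9, 9, 3, 3, 9 },
                                     { 9, 27, 3, 9, 9, 3 },
                                     { 9, 3, 27, 9, 9, 3 },
                                     { 3, 9, 9, 27, 3, 9 },
                                     { 3, 9, 9, 3, 27, 9 },
                                     { 9, 3, 3, 9, 9, 27 } };
  // Packed values as they appear in the DATA statements above
  const int packedData[ncolor * ( ncolor + 1 ) / 2] = { 27, 18, 18, 6, 6, 18,
                                                        27, 6, 18, 18, 6,
                                                        27, 18, 18, 6,
                                                        27, 6, 18,
                                                        27, 18,
                                                        27 };
  // Reproduce the packing: walk rows i, columns j >= i (this mirrors the
  // sequential CF_INDEX increment in the rewritten colour-sum loop)
  int cfIndex = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = i; j < ncolor; j++ )
    {
      const int expected = ( j == i ? full[i][j] : 2 * full[i][j] ); // off-diagonals doubled
      assert( packedData[cfIndex] == expected );
      cfIndex++;
    }
  assert( cfIndex == ncolor * ( ncolor + 1 ) / 2 ); // 21 entries for ncolor=6
  return 0;
}

Since DENOM is 1 for this subprocess, the final MATRIX1/DENOM division is a no-op here, presumably kept so that the same loop structure also covers processes with non-unit colour denominators.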
- IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -565,10 +545,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -577,6 +559,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc index 8d266e82b7..8703f64023 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,9 +101,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -109,10 +112,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,43 +171,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -219,7 +275,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -228,14 +283,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
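The calculate_jamps signature above writes each helicity's partial amplitudes into a slice of a jamp super-buffer, and the striding comments in color_sum.cc earlier in this patch spell out the layout as [2][ncolor][nhel][nevt] (all real parts first, then all imaginary parts). The short standalone C++ sketch below simply encodes that flat index; jampIndex is a hypothetical helper for illustration, not the plugin's DeviceAccessJamp class.

// Hedged sketch (hypothetical helper): the flat index implied by the striding
// comments in color_sum_blas above, for a jamp super-buffer laid out as
// [nx2][ncolor][nhel][nevt] with nx2=2 (real, imaginary).
#include <cassert>
#include <cstddef>

inline std::size_t jampIndex( int ix2, int icol, int ihel, int ievt,
                              int ncolor, int nhel, int nevt )
{
  // index = ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt
  return ( ( static_cast<std::size_t>( ix2 ) * ncolor + icol ) * nhel + ihel ) * static_cast<std::size_t>( nevt ) + ievt;
}

int main()
{
  const int ncolor = 6, nhel = 16, nevt = 1024;
  // real part of the jamp for colour 2, helicity 5, event 17
  const std::size_t idxRe = jampIndex( 0, 2, 5, 17, ncolor, nhel, nevt );
  // imaginary part of the same jamp
  const std::size_t idxIm = jampIndex( 1, 2, 5, 17, ncolor, nhel, nevt );
  assert( idxIm == idxRe + static_cast<std::size_t>( ncolor ) * nhel * nevt );
  return 0;
}

With this layout the imaginary part of any jamp sits exactly ncolor*nhel*nevt elements after its real part, which is what lets color_sum_blas treat the real and imaginary halves as two separate ncolor x (nhel*nevt) matrices.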
@@ -261,14 +319,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -292,7 +346,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -306,7 +359,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -317,6 +369,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -559,158 +615,43 @@ namespace mg5amcCpu jamp_sv[5] -= 1. / 12. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_uu_ttxuu()?) 
- - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 27, 9, 9, 3, 3, 9 }, - { 9, 27, 3, 9, 9, 3 }, - { 9, 3, 27, 9, 9, 3 }, - { 3, 9, 9, 27, 3, 9 }, - { 3, 9, 9, 3, 27, 9 }, - { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
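The comment block above (kept verbatim in the new color_sum.cc) is the core of the colour-sum optimization: because the colour matrix is real, the quadratic form splits into separate real-part and imaginary-part contributions, and because it is symmetric only the upper triangle needs to be visited, with off-diagonal terms counted twice. The standalone C++ check below evaluates both forms for the 6x6 colour matrix of this subprocess and arbitrary made-up jamp values and verifies that they agree; it is a worked example, not the plugin implementation.

// Worked example (not the plugin code): the colour sum |M|^2 += J^dagger (CF/denom) J
// computed with the full symmetric colour matrix and with its upper-triangular
// form where off-diagonal entries are counted twice, as the comments above explain.
#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  constexpr int ncolor = 6;
  const double denom[ncolor] = { 1, 1, 1, 1, 1, 1 };
  const double cf[ncolor][ncolor] = { { 27, 9, 9, 3, 3, 9 },
                                      { 9, 27, 3, 9, 9, 3 },
                                      { 9, 3, 27, 9, 9, 3 },
                                      { 3, 9, 9, 27, 3, 9 },
                                      { 3, 9, 9, 3, 27, 9 },
                                      { 9, 3, 3, 9, 9, 27 } };
  const std::complex<double> jamp[ncolor] = { { 0.1, -0.2 }, { 0.3, 0.4 }, { -0.5, 0.6 },
                                              { 0.7, -0.8 }, { -0.9, 1.0 }, { 1.1, 1.2 } };
  // (1) Full quadratic form: since CF is real, only ReJi*ReJj + ImJi*ImJj survives.
  double meFull = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      meFull += cf[i][j] / denom[i] * ( jamp[i].real() * jamp[j].real() + jamp[i].imag() * jamp[j].imag() );
  // (2) Triangular form: diagonal once, off-diagonal terms doubled (CF symmetric, denom constant).
  double meTri = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    meTri += cf[i][i] / denom[i] * std::norm( jamp[i] );
    for( int j = i + 1; j < ncolor; j++ )
      meTri += 2 * cf[i][j] / denom[i] * ( jamp[i].real() * jamp[j].real() + jamp[i].imag() * jamp[j].imag() );
  }
  assert( std::abs( meFull - meTri ) < 1e-12 * std::abs( meFull ) );
  return 0;
}

As the original comment notes, the triangular loop gains less than a factor 2 on CPU, while the square loop had been kept for CUDA; the new color_sum_kernel earlier in this patch now applies a lower-triangular variant of the same idea on the GPU as well.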
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -798,7 +739,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -833,6 +778,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -875,6 +824,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -995,8 +948,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -1004,25 +957,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1167,13 +1298,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 72 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1185,18 +1310,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1221,93 +1351,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1349,7 +1416,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1372,7 +1439,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1381,21 +1448,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1409,8 +1478,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1426,11 +1497,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1532,14 +1604,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h index fd123d932d..1b094c86d9 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,6 +79,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 14; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -125,7 +127,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -133,9 +135,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -155,34 +159,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running 
sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f index 77164138e6..286e0ec15a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f index f03c7f3b0c..4e96309281 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,7 +140,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) @@ -150,7 +150,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF D2=PDG2PDF(LPP(IB(2)),1, IB(2),XBK(IB(2)), QSCALE) U2=PDG2PDF(LPP(IB(2)),2, IB(2),XBK(IB(2)), QSCALE) @@ -237,7 +237,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -313,6 +313,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -398,24 +402,24 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) D2(IVEC)=PDG2PDF(LPP(IB(2)),1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) U2(IVEC)=PDG2PDF(LPP(IB(2)),2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) S2(IVEC)=PDG2PDF(LPP(IB(2)),3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) C2(IVEC)=PDG2PDF(LPP(IB(2)),4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -497,51 +501,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/color_sum.cc new file mode 100644 index 0000000000..a1e583992a --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/color_sum.cc @@ -0,0 +1,431 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
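// What this new file computes, in plain terms: for one helicity, the color sum adds
// |M|^2 += sum_{i,j} Re( conj(jamp[i]) * CF[i][j] * jamp[j] ) / denom[i] to the running
// sum over helicities, using the 6x6 color matrix and denominators defined below.
// The following is a minimal standalone sketch of that sum (illustrative only: plain
// std::complex and a hypothetical colorSumReference helper, not the plugin's buffers,
// memory accessors or SIMD types), useful as a reference when reading the optimised
// color_sum_cpu / color_sum_kernel / color_sum_blas paths that follow.

#include <array>
#include <complex>

double colorSumReference( const std::array<std::complex<double>, 6>& jamp )
{
  constexpr int ncolor = 6; // number of leading colors for this process
  constexpr double denom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // the colorDenom values below
  constexpr double cf[ncolor][ncolor] = { // the colorMatrix values below
    { 27, 9, 9, 3, 3, 9 },
    { 9, 27, 3, 9, 9, 3 },
    { 9, 3, 27, 9, 9, 3 },
    { 3, 9, 9, 27, 3, 9 },
    { 3, 9, 9, 3, 27, 9 },
    { 9, 3, 3, 9, 9, 27 } };
  double deltaME = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    std::complex<double> ztemp = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ )
      ztemp += cf[icol][jcol] * jamp[jcol]; // full symmetric matrix, no triangular folding
    deltaME += std::real( std::conj( jamp[icol] ) * ztemp ) / denom[icol];
  }
  return deltaME; // contribution of this helicity to |M|^2, before the final average over helicities/colors
}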
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 27, 9, 9, 3, 3, 9 }, + { 9, 27, 3, 9, 9, 3 }, + { 9, 3, 27, 9, 9, 3 }, + { 3, 9, 9, 27, 3, 9 }, + { 3, 9, 9, 3, 27, 9 }, + { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. 
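// A standalone sketch of the #475 triangular folding that cf2 above implements
// (illustrative only: a hypothetical colorSumFolded helper with plain doubles, no SIMD
// vectors, no memory accessors). The symmetric matrix is folded once into its upper
// triangle, with off-diagonal entries doubled and each row pre-divided by its
// denominator, and the sum AMA + BMB is then evaluated over j >= i only; with the equal
// colorDenom entries of this process the result matches the plain full-matrix sum.

#include <complex>

template<int NCOL>
double colorSumFolded( const std::complex<double> ( &jamp )[NCOL],
                       const double ( &cf )[NCOL][NCOL],
                       const double ( &denom )[NCOL] )
{
  double tri[NCOL][NCOL] = {}; // upper-triangular, pre-normalized copy of the color matrix
  for( int i = 0; i < NCOL; i++ )
  {
    tri[i][i] = cf[i][i] / denom[i]; // diagonal terms: no factor 2
    for( int j = i + 1; j < NCOL; j++ )
      tri[i][j] = 2 * cf[i][j] / denom[i]; // off-diagonal terms: (i,j) and (j,i) folded together
  }
  double deltaME = 0;
  for( int i = 0; i < NCOL; i++ )
  {
    double ztempR = 0, ztempI = 0;
    for( int j = i; j < NCOL; j++ )
    {
      ztempR += tri[i][j] * jamp[j].real();
      ztempI += tri[i][j] * jamp[j].imag();
    }
    deltaME += jamp[i].real() * ztempR + jamp[i].imag() * ztempI; // AMA + BMB: the mixed terms cancel for a real matrix
  }
  return deltaME;
}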
+ // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int 
icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/configs.inc index ab6edc7392..c4057663aa 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/configs.inc +++ 
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/configs.inc @@ -210,3 +210,5 @@ C Diagram 14 DATA (SPROP(I,-4,14),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/14/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/driver.f index f7f23196eb..5997e65826 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f index 8b80833180..d9f5b54ed3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -74,10 +74,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -278,17 +275,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -358,7 +344,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -404,7 +390,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(16) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -447,39 +434,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /2.700000000000000D+01 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,3.000000000000000D+00,9.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 6) /27,18,18,6,6,18/ C 1 T(3,1) T(5,2) T(6,4) - DATA (CF(I, 2),I= 1, 6) /9.000000000000000D+00 - $ ,2.700000000000000D+01,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 7, 11) /27,6,18,18,6/ C 1 T(3,1) T(5,4) T(6,2) - DATA (CF(I, 3),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,2.700000000000000D+01,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 12, 15) /27,18,18,6/ C 1 T(3,2) T(5,1) T(6,4) - DATA (CF(I, 4),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,2.700000000000000D - $ +01,3.000000000000000D+00,9.000000000000000D+00/ + DATA (CF(I),I= 16, 18) /27,6,18/ C 1 T(3,2) T(5,4) T(6,1) - DATA (CF(I, 5),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,2.700000000000000D+01,9.000000000000000D+00/ + DATA (CF(I),I= 19, 20) /27,18/ C 1 T(3,4) T(5,1) T(6,2) - DATA (CF(I, 6),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,2.700000000000000D+01/ + DATA (CF(I),I= 21, 21) /27/ C 1 T(3,4) T(5,2) T(6,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. 
- IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -585,10 +565,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -597,6 +579,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc index 1b918bae84..6ffd72682d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -107,9 +109,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -117,10 +120,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -179,43 +179,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -227,7 +283,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -236,14 +291,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
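A minimal CUDA sketch of the structure-of-arrays indexing used by the DeviceAccessJamp2 accessor defined earlier in this file: element (icol, ievt) of a per-color, per-event buffer lives at icol * nevt + ievt, so consecutive threads of a warp read consecutive addresses for a fixed color. The kernel name and the plain double-precision buffers here are assumptions for illustration only.

```cpp
#include <cuda_runtime.h>

// Hypothetical kernel: sum a [ncolor * nevt] per-color buffer over colors, one event per thread.
__global__ void sumOverColors( const double* colAllJamp2s, // input: [ncolor * nevt]
                               double* out,                // output: [nevt]
                               const int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per GPU thread
  double sum = 0.;
  for( int icol = 0; icol < ncolor; icol++ )
    sum += colAllJamp2s[icol * nevt + ievt]; // same formula as kernelAccessIcolConst above
  out[ievt] = sum;
}
```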
@@ -269,14 +327,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -300,7 +354,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -314,7 +367,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -325,6 +377,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -455,158 +511,43 @@ namespace mg5amcCpu jamp_sv[5] -= 1. / 12. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_uux_ttxccx()?) 
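In the GPU branch that follows, each good helicity runs calculate_jamps in its own stream, and all of them accumulate |jamp|^2 into the same (icol, ievt) slot of the shared colAllJamp2s buffer, which is why an atomicAdd is used there. The hedged sketch below (hypothetical kernel and buffer names, double precision, atomicAdd on double assumes compute capability 6.0 or later) shows the pattern: a plain "+=" would be a data race between streams.

```cpp
#include <cuda_runtime.h>

__global__ void accumulateJamp2OneHelicity( double* colAllJamp2s, // in/out: [ncolor * nevt], shared by all helicity streams
                                            const double* jampRe, // input: [ncolor * nevt], real parts for this helicity
                                            const double* jampIm, // input: [ncolor * nevt], imaginary parts for this helicity
                                            const int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    const double re = jampRe[icol * nevt + ievt];
    const double im = jampIm[icol * nevt + ievt];
    atomicAdd( &colAllJamp2s[icol * nevt + ievt], re * re + im * im ); // race-free sum across helicity streams
  }
}

void accumulateAllHelicities( double* colAllJamp2s, double* const* jampReByHel, double* const* jampImByHel,
                              cudaStream_t* streams, int nGoodHel, int ncolor, int gpublocks, int gputhreads )
{
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) // one good helicity per stream
    accumulateJamp2OneHelicity<<<gpublocks, gputhreads, 0, streams[ighel]>>>(
      colAllJamp2s, jampReByHel[ighel], jampImByHel[ighel], ncolor );
  cudaDeviceSynchronize(); // wait for all streams before colAllJamp2s is consumed (e.g. for color choice)
}
```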
- - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 27, 9, 9, 3, 3, 9 }, - { 9, 27, 3, 9, 9, 3 }, - { 9, 3, 27, 9, 9, 3 }, - { 3, 9, 9, 27, 3, 9 }, - { 3, 9, 9, 3, 27, 9 }, - { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
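The removed comments above (#475) note that the color matrix M is real and symmetric, so the quadratic form conj(J)^T M J can be evaluated purely in real arithmetic as A^T M A + B^T M B with J = A + iB: the imaginary cross terms cancel because A^T M B = B^T M A. The following C++ sketch spells out both evaluations with the cf values from this file (all color denominators are 1 for this process); the function names are illustrative, and the two results agree up to rounding.

```cpp
#include <complex>

constexpr int ncolor6 = 6;
constexpr double cfFull[ncolor6][ncolor6] = {
  { 27, 9, 9, 3, 3, 9 },
  { 9, 27, 3, 9, 9, 3 },
  { 9, 3, 27, 9, 9, 3 },
  { 3, 9, 9, 27, 3, 9 },
  { 3, 9, 9, 3, 27, 9 },
  { 9, 3, 3, 9, 9, 27 } };

// Complex evaluation: Re( sum_ij conj(J_i) * M_ij * J_j )
double colorSumComplex( const std::complex<double> jamp[ncolor6] )
{
  std::complex<double> sum = 0;
  for( int i = 0; i < ncolor6; i++ )
    for( int j = 0; j < ncolor6; j++ )
      sum += std::conj( jamp[i] ) * cfFull[i][j] * jamp[j];
  return sum.real(); // the imaginary part vanishes because M is real and symmetric
}

// Real evaluation: A^T M A + B^T M B, i.e. two real quadratic forms (no complex products)
double colorSumRealParts( const std::complex<double> jamp[ncolor6] )
{
  double sum = 0;
  for( int i = 0; i < ncolor6; i++ )
    for( int j = 0; j < ncolor6; j++ )
      sum += cfFull[i][j] * ( jamp[i].real() * jamp[j].real() + jamp[i].imag() * jamp[j].imag() );
  return sum; // equals colorSumComplex( jamp ) up to rounding
}
```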
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -694,7 +635,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -729,6 +674,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -771,6 +720,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -891,8 +844,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -900,25 +853,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 
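The added code below chooses one helicity per event from the running sums over good helicities: the first good helicity whose cumulative |M|^2 fraction exceeds the event's random number is selected and stored in Fortran indexing. A minimal host-side C++ sketch of the same logic (a hypothetical standalone helper, not the generated kernel) follows.

```cpp
#include <vector>

int selectHelicity( const std::vector<double>& meByGoodHel, // |M|^2 contribution of each good helicity for one event
                    const std::vector<int>& goodHel,        // map ighel -> ihel (cudacpp indexing, 0..ncomb-1)
                    double rndhel )                         // flat random number in [0,1) for this event
{
  double total = 0.;
  for( double me : meByGoodHel ) total += me;
  double running = 0.;
  for( int ighel = 0; ighel < (int)meByGoodHel.size(); ighel++ )
  {
    running += meByGoodHel[ighel];
    if( rndhel < running / total ) return goodHel[ighel] + 1; // +1: Fortran range [1,ncomb]
  }
  return goodHel.back() + 1; // numerical safety net for rndhel ~ 1 and rounding
}
```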
+ //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1063,13 +1194,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1081,18 +1206,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1117,93 +1247,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1245,7 +1312,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1268,7 +1335,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1277,21 +1344,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1305,8 +1374,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1322,11 +1393,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1428,14 +1500,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h index 87faf25dfb..8fc89f1eaf 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -86,6 +87,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 7; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -133,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -141,9 +143,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -163,34 +167,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum 
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f index e3f26606a1..7bbb5f78a7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f index 74f009d272..b302e0aabb 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -148,7 +148,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) @@ -158,7 +158,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -269,7 +269,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -353,6 +353,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -438,24 +442,24 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -585,51 +589,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/color_sum.cc new file mode 100644 index 0000000000..a1e583992a --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/color_sum.cc @@ -0,0 +1,431 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 27, 9, 9, 3, 3, 9 }, + { 9, 27, 3, 9, 9, 3 }, + { 9, 3, 27, 9, 9, 3 }, + { 3, 9, 9, 27, 3, 9 }, + { 3, 9, 9, 3, 27, 9 }, + { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. 
+ // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int 
icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here
+    for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ )
+      for( int icol = 0; icol < ncolor; icol++ )
+        allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] =
+          allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt];
+  }
+#endif
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+#ifndef MGONGPU_HAS_NO_BLAS
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+  __global__ void
+  convertF2D_MEs( fptype* allMEs,             // output: allMEs[nevt] for one specific helicity
+                  const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity
+  {
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+    allMEs[ievt] = allMEsFpt2[ievt];
+  }
+#endif
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifndef MGONGPU_HAS_NO_BLAS
+  void
+  color_sum_blas( fptype* ghelAllMEs,           // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity
+                  const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities
+                  fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nhel good helicities
+                  gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+                  gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null)
+#else
+                  gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null)
+#endif
+                  const int nhel,               // input: number of good helicities (nhel == nGoodHel)
+                  const int gpublocks,          // input: cuda gpublocks
+                  const int gputhreads )        // input: cuda gputhreads
+  {
+    const int nevt = gpublocks * gputhreads;
+
+    // Get the address associated with the normalized color matrix in device memory
+    static fptype2* devNormColMat = nullptr;
+    if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 );
+
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffer for the nhel helicities
+    fptype2* ghelAllZtempBoth = ghelAllBlasTmp;                                         // start of first fptype2[ncolor*2*nhel*nevt] buffer
+    fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt;   // start of second fptype2[ncolor*2*nhel*nevt] buffer
+    fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer
+    // Convert jamps from double to float
+    for( int ighel = 0; ighel < nhel; ighel++ )
+    {
+      const fptype* hAllJamps = ghelAllJamps + ighel * nevt;    // jamps for a single helicity ihel
+      fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel
+      gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel );
+    }
+    // Real and imaginary components
+    const fptype2* ghelAllJampsReal = ghelAllJampsFpt2;
+    const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt;
+#else
+    // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer
+    static_assert( std::is_same<fptype, fptype2>::value );
+    fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/configs.inc index 4cc87fa0bb..ae09f753a1 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/configs.inc +++ 
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/configs.inc @@ -90,3 +90,5 @@ C Diagram 7 DATA (SPROP(I,-4,7),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/7/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/driver.f index f7f23196eb..5997e65826 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f index 728711155f..46ce392684 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -82,10 +82,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -286,17 +283,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -366,7 +352,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -420,7 +406,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(8) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -463,39 +450,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /2.700000000000000D+01 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,3.000000000000000D+00,9.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 6) /27,18,18,6,6,18/ C 1 T(2,1) T(3,4) T(5,6) - DATA (CF(I, 2),I= 1, 6) /9.000000000000000D+00 - $ ,2.700000000000000D+01,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 7, 11) /27,6,18,18,6/ C 1 T(2,1) T(3,6) T(5,4) - DATA (CF(I, 3),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,2.700000000000000D+01,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 12, 15) /27,18,18,6/ C 1 T(2,4) T(3,1) T(5,6) - DATA (CF(I, 4),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,2.700000000000000D - $ +01,3.000000000000000D+00,9.000000000000000D+00/ + DATA (CF(I),I= 16, 18) /27,6,18/ C 1 T(2,4) T(3,6) T(5,1) - DATA (CF(I, 5),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,2.700000000000000D+01,9.000000000000000D+00/ + DATA (CF(I),I= 19, 20) /27,18/ C 1 T(2,6) T(3,1) T(5,4) - DATA (CF(I, 6),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,2.700000000000000D+01/ + DATA (CF(I),I= 21, 21) /27/ C 1 T(2,6) T(3,4) T(5,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. 
- IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -565,10 +545,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -577,6 +559,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc index 1c575b7757..7c3e3fa6c7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,9 +101,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -109,10 +112,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 12; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,43 +171,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities -#endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId 
= 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) +#endif + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using 
E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -219,7 +275,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -228,14 +283,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
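
The super-buffer striding that calculate_jamps and the color-sum kernels rely on (spelled out in the "striding for cuBLAS" comments of color_sum.cc above) can be written as a standalone indexing helper. The sketch below is illustrative only and is not part of the generated code: jampSuperIndex is a hypothetical name, and the real accessor class is DeviceAccessJamp (not shown in this diff).

#include <cstddef>

// Flat index of the real (ireim=0) or imaginary (ireim=1) part of jamp(icol), for good helicity
// ighel and event ievt, in a super-buffer of 2 * ncolor * nGoodHel * nevt values laid out as
// [ireim][icol][ighel][ievt] with the event index running fastest.
inline std::size_t
jampSuperIndex( int ireim, int icol, int ighel, int ievt, int ncolor, int nGoodHel, int nevt )
{
  return static_cast<std::size_t>( ireim ) * ncolor * nGoodHel * nevt // real block first, then imaginary block
         + static_cast<std::size_t>( icol ) * nGoodHel * nevt         // then one slice per color
         + static_cast<std::size_t>( ighel ) * nevt                   // then one slice per good helicity
         + static_cast<std::size_t>( ievt );                          // events are contiguous
}

With this layout, "ghelAllJamps + ighel * nevt" is the address of the (ireim=0, icol=0, ievt=0) element for helicity ighel, which is why sigmaKin can hand that offset directly to the per-helicity kernels, which in turn index the buffer with ihel0=0 and an nGoodHel stride.
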
@@ -261,14 +319,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -292,7 +346,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -306,7 +359,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -317,6 +369,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -868,164 +924,43 @@ namespace mg5amcCpu jamp_sv[10] += 1. / 2. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_uux_ttxgg()?) 
- - // The color denominators (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] - - // The color matrix (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 48, -6, 16, -2, 16, -2, 6, 6, 0, -2, 16, 0 }, - { -6, 48, -2, 16, -2, 16, 6, 6, 0, 16, -2, 0 }, - { 16, -2, 48, -6, 6, 6, 16, -2, -2, 0, 0, 16 }, - { -2, 16, -6, 48, 6, 6, -2, 16, 16, 0, 0, -2 }, - { 16, -2, 6, 6, 48, -6, 16, -2, 16, 0, 0, -2 }, - { -2, 16, 6, 6, -6, 48, -2, 16, -2, 0, 0, 16 }, - { 6, 6, 16, -2, 16, -2, 48, -6, 0, 16, -2, 0 }, - { 6, 6, -2, 16, -2, 16, -6, 48, 0, -2, 16, 0 }, - { 0, 0, -2, 16, 16, -2, 0, 0, 48, 16, 16, 6 }, - { -2, 16, 0, 0, 0, 0, 16, -2, 16, 48, 6, 16 }, - { 16, -2, 0, 0, 0, 0, -2, 16, 16, 6, 48, 16 }, - { 0, 0, 16, -2, -2, 16, 0, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
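
// Illustrative scalar sketch of the two properties used in the comments above (not part of the
// diff; colorSumScalar and its arguments are hypothetical, double precision only). With
// jamp = A + i*B and a real color matrix M, the cross terms of (A-iB)^T M (A+iB) are purely
// imaginary, so the real part is A^T M A + B^T M B; if the normalized matrix is also symmetric,
// the double sum folds into a loop over the upper triangle with a factor 2 off the diagonal.
#include <complex>
#include <vector>

// Returns sum_{i,j} Re( conj(jamp[i]) * m[i][j] * jamp[j] ) for a real symmetric matrix m,
// visiting each off-diagonal pair only once (factor 2) and each diagonal term once.
double colorSumScalar( const std::vector<std::vector<double>>& m,
                       const std::vector<std::complex<double>>& jamp )
{
  const int ncol = (int)jamp.size();
  double me2 = 0;
  for( int i = 0; i < ncol; i++ )
  {
    double ztR = m[i][i] * jamp[i].real();
    double ztI = m[i][i] * jamp[i].imag();
    for( int j = i + 1; j < ncol; j++ )
    {
      ztR += 2 * m[i][j] * jamp[j].real(); // the factor 2 is what the constexpr triangular matrix precomputes
      ztI += 2 * m[i][j] * jamp[j].imag();
    }
    me2 += jamp[i].real() * ztR + jamp[i].imag() * ztI; // AMA + BMB: the imaginary cross terms drop out
  }
  return me2;
}
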
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -1113,7 +1048,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -1148,6 +1087,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -1190,6 +1133,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -1310,8 +1257,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -1319,25 +1266,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of 
helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1482,13 +1607,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 72 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1500,18 +1619,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1536,93 +1660,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1664,7 +1725,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1687,7 +1748,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1696,21 +1757,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1724,8 +1787,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1741,11 +1806,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1847,14 +1913,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h index 0689624568..b52ac7b6b3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,6 +79,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 36; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 12; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -125,7 +127,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -133,9 +135,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -155,34 +159,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running 
sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f index 5787ba42b2..91d2c20b98 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f index 75d947b792..ede7c99981 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,7 +140,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) @@ -150,7 +150,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -237,7 +237,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -313,6 +313,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -398,24 +402,24 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -497,51 +501,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/color_sum.cc new file mode 100644 index 0000000000..82ceb3958f --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/color_sum.cc @@ -0,0 +1,437 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
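[Reviewer note, not part of the patch] The new color_sum.cc introduced here evaluates the per-helicity |M|^2 as the quadratic form jamp^dagger * (colorMatrix/colorDenom) * jamp; because the color matrix is real and symmetric (and colorDenom is constant for this process), only the diagonal plus twice one triangle of the matrix needs to be summed (#475). The short standalone C++ sketch below is added for illustration only, using a toy 3x3 slice of the colorMatrix/colorDenom constants defined further down and hypothetical jamp values (meFull, meTri and the toy arrays are illustrative names, not identifiers from this patch); it checks that the triangular form reproduces the full quadratic form.

// Illustrative standalone sketch (NOT part of the generated color_sum.cc):
// verify that the triangular color sum equals the full quadratic form
// for a real symmetric color matrix with a constant denominator.
#include <cassert>
#include <cmath>
#include <complex>
#include <cstdio>

int main()
{
  constexpr int N = 3;                      // toy size (the real process below uses ncolor = 12)
  const double denom = 3;                   // constant color denominator, as in colorDenom below
  const double cf[N][N] = { { 48, -6, 16 }, // toy slice of the real symmetric color matrix below
                            { -6, 48, -2 },
                            { 16, -2, 48 } };
  const std::complex<double> jamp[N] = { { 1.0, 0.5 }, { -0.25, 2.0 }, { 0.75, -1.0 } }; // hypothetical jamps
  // Full quadratic form: sum_ij conj(jamp_i) * (cf_ij/denom) * jamp_j (imaginary parts cancel by symmetry)
  double meFull = 0;
  for( int i = 0; i < N; i++ )
    for( int j = 0; j < N; j++ )
      meFull += ( std::conj( jamp[i] ) * ( cf[i][j] / denom ) * jamp[j] ).real();
  // Triangular form (#475): diagonal terms once, off-diagonal terms doubled because cf is symmetric
  double meTri = 0;
  for( int i = 0; i < N; i++ )
  {
    double ztempR = ( cf[i][i] / denom ) * jamp[i].real();
    double ztempI = ( cf[i][i] / denom ) * jamp[i].imag();
    for( int j = i + 1; j < N; j++ )
    {
      ztempR += 2 * ( cf[i][j] / denom ) * jamp[j].real();
      ztempI += 2 * ( cf[i][j] / denom ) * jamp[j].imag();
    }
    meTri += ztempR * jamp[i].real() + ztempI * jamp[i].imag();
  }
  assert( std::abs( meFull - meTri ) < 1e-12 * std::abs( meFull ) );
  printf( "full color sum = %f, triangular color sum = %f\n", meFull, meTri );
  return 0;
}

The same identity is applied by the CUDA color_sum_kernel further down (factor 2 on the strict lower triangle), while the BLAS path keeps the full normalized color matrix so that the color sum can be expressed as one GEMM plus batched dot products.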
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] + + // The color matrix (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 48, -6, 16, -2, 16, -2, 6, 6, 0, -2, 16, 0 }, + { -6, 48, -2, 16, -2, 16, 6, 6, 0, 16, -2, 0 }, + { 16, -2, 48, -6, 6, 6, 16, -2, -2, 0, 0, 16 }, + { -2, 16, -6, 48, 6, 6, -2, 16, 16, 0, 0, -2 }, + { 16, -2, 6, 6, 48, -6, 16, -2, 16, 0, 0, -2 }, + { -2, 16, 6, 6, -6, 48, -2, 16, -2, 0, 0, 16 }, + { 6, 6, 16, -2, 16, -2, 48, -6, 0, 16, -2, 0 }, + { 6, 6, -2, 16, -2, 16, -6, 48, 0, -2, 16, 0 }, + { 0, 0, -2, 16, 16, -2, 0, 0, 48, 16, 16, 6 }, + { -2, 16, 0, 0, 0, 0, 16, -2, 16, 48, 6, 16 }, + { 16, -2, 0, 0, 0, 0, -2, 16, 16, 6, 48, 16 }, + { 0, 0, 16, -2, -2, 16, 0, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = 
TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: 
number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/configs.inc index e246a996f4..9e83bfd791 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/configs.inc +++ 
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/configs.inc @@ -480,3 +480,5 @@ C Diagram 35 DATA TPRID(-3,35)/0/ C Number of configs DATA MAPCONFIG(0)/35/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/driver.f index f7f23196eb..5997e65826 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f index 65c377ffc0..859d368c2b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -74,10 +74,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -278,17 +275,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -358,7 +344,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -404,7 +390,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(17) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -447,111 +434,44 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /1.600000000000000D+01, - $ -2.000000000000000D+00,5.333333333333333D+00, - $ -6.666666666666666D-01,5.333333333333333D+00, - $ -6.666666666666666D-01/ - DATA (CF(I, 1),I= 7, 12) /2.000000000000000D+00 - $ ,2.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 12) /48,-12,32,-4,32,-4,12,12,0,-4,32,0/ C 1 T(2,1) T(5,6,3,4) - DATA (CF(I, 2),I= 1, 6) /-2.000000000000000D+00 - $ ,1.600000000000000D+01,-6.666666666666666D-01 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,5.333333333333333D+00/ - DATA (CF(I, 2),I= 7, 12) /2.000000000000000D+00 - $ ,2.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00,-6.666666666666666D-01,0.000000000000000D+00/ + DATA (CF(I),I= 13, 23) /48,-4,32,-4,32,12,12,0,32,-4,0/ C 1 T(2,1) T(6,5,3,4) - DATA (CF(I, 3),I= 1, 6) /5.333333333333333D+00, - $ -6.666666666666666D-01,1.600000000000000D+01, - $ -2.000000000000000D+00,2.000000000000000D+00,2.000000000000000D - $ +00/ - DATA (CF(I, 3),I= 7, 12) /5.333333333333333D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00/ + DATA (CF(I),I= 24, 33) /48,-12,12,12,32,-4,-4,0,0,32/ C 1 T(2,4) T(5,6,3,1) - DATA (CF(I, 4),I= 1, 6) /-6.666666666666666D-01 - $ ,5.333333333333333D+00,-2.000000000000000D+00 - $ ,1.600000000000000D+01,2.000000000000000D+00,2.000000000000000D - $ +00/ - DATA (CF(I, 4),I= 7, 12) /-6.666666666666666D-01 - $ ,5.333333333333333D+00,5.333333333333333D+00,0.000000000000000D - $ +00,0.000000000000000D+00,-6.666666666666666D-01/ + DATA (CF(I),I= 34, 42) /48,12,12,-4,32,32,0,0,-4/ C 1 T(2,4) T(6,5,3,1) - DATA (CF(I, 5),I= 1, 6) /5.333333333333333D+00, - $ 
-6.666666666666666D-01,2.000000000000000D+00,2.000000000000000D - $ +00,1.600000000000000D+01,-2.000000000000000D+00/ - DATA (CF(I, 5),I= 7, 12) /5.333333333333333D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00,0.000000000000000D+00,-6.666666666666666D-01/ + DATA (CF(I),I= 43, 50) /48,-12,32,-4,32,0,0,-4/ C 1 T(3,1) T(5,6,2,4) - DATA (CF(I, 6),I= 1, 6) /-6.666666666666666D-01 - $ ,5.333333333333333D+00,2.000000000000000D+00,2.000000000000000D - $ +00,-2.000000000000000D+00,1.600000000000000D+01/ - DATA (CF(I, 6),I= 7, 12) /-6.666666666666666D-01 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00/ + DATA (CF(I),I= 51, 57) /48,-4,32,-4,0,0,32/ C 1 T(3,1) T(6,5,2,4) - DATA (CF(I, 7),I= 1, 6) /2.000000000000000D+00 - $ ,2.000000000000000D+00,5.333333333333333D+00, - $ -6.666666666666666D-01,5.333333333333333D+00, - $ -6.666666666666666D-01/ - DATA (CF(I, 7),I= 7, 12) /1.600000000000000D+01, - $ -2.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00,-6.666666666666666D-01,0.000000000000000D+00/ + DATA (CF(I),I= 58, 63) /48,-12,0,32,-4,0/ C 1 T(3,4) T(5,6,2,1) - DATA (CF(I, 8),I= 1, 6) /2.000000000000000D+00 - $ ,2.000000000000000D+00,-6.666666666666666D-01 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,5.333333333333333D+00/ - DATA (CF(I, 8),I= 7, 12) /-2.000000000000000D+00 - $ ,1.600000000000000D+01,0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00/ + DATA (CF(I),I= 64, 68) /48,0,-4,32,0/ C 1 T(3,4) T(6,5,2,1) - DATA (CF(I, 9),I= 1, 6) /0.000000000000000D+00 - $ ,0.000000000000000D+00,-6.666666666666666D-01 - $ ,5.333333333333333D+00,5.333333333333333D+00, - $ -6.666666666666666D-01/ - DATA (CF(I, 9),I= 7, 12) /0.000000000000000D+00 - $ ,0.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00,5.333333333333333D+00,2.000000000000000D+00/ + DATA (CF(I),I= 69, 72) /48,32,32,12/ C 1 T(5,2,1) T(6,3,4) - DATA (CF(I, 10),I= 1, 6) /-6.666666666666666D-01 - $ ,5.333333333333333D+00,0.000000000000000D+00,0.000000000000000D - $ +00,0.000000000000000D+00,0.000000000000000D+00/ - DATA (CF(I, 10),I= 7, 12) /5.333333333333333D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,1.600000000000000D - $ +01,2.000000000000000D+00,5.333333333333333D+00/ + DATA (CF(I),I= 73, 75) /48,12,32/ C 1 T(5,2,4) T(6,3,1) - DATA (CF(I, 11),I= 1, 6) /5.333333333333333D+00, - $ -6.666666666666666D-01,0.000000000000000D+00,0.000000000000000D - $ +00,0.000000000000000D+00,0.000000000000000D+00/ - DATA (CF(I, 11),I= 7, 12) /-6.666666666666666D-01 - $ ,5.333333333333333D+00,5.333333333333333D+00,2.000000000000000D - $ +00,1.600000000000000D+01,5.333333333333333D+00/ + DATA (CF(I),I= 76, 77) /48,32/ C 1 T(5,3,1) T(6,2,4) - DATA (CF(I, 12),I= 1, 6) /0.000000000000000D+00 - $ ,0.000000000000000D+00,5.333333333333333D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,5.333333333333333D+00/ - DATA (CF(I, 12),I= 7, 12) /0.000000000000000D+00 - $ ,0.000000000000000D+00,2.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,1.600000000000000D+01/ + DATA (CF(I),I= 78, 78) /48/ C 1 T(5,3,4) T(6,2,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. 
- IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -761,10 +681,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -773,6 +695,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc index e6d6423d5e..1d17cbcb1b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,9 +101,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -109,10 +112,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,43 +171,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -219,7 +275,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -228,14 +283,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
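The DeviceAccessJamp2 accessor introduced above addresses the jamp2 super-buffer as buffer[icol * nevt + ievt], i.e. all events for colour 0, then all events for colour 1, and so on. As a quick illustration of that event-major layout (a standalone host-side sketch, not part of the generated code; toyJamp2Index, nevt and the sample indices are made up for the example), consecutive events of the same colour sit next to each other in memory, which is what keeps accesses by consecutive GPU threads coalesced:

#include <cassert>

// jamp2 super-buffer layout assumed here: buffer[icol * nevt + ievt]
inline int toyJamp2Index( int icol, int ievt, int nevt )
{
  return icol * nevt + ievt;
}

int main()
{
  const int nevt = 8;
  // neighbouring events of the same colour are adjacent in memory
  assert( toyJamp2Index( 2, 3, nevt ) + 1 == toyJamp2Index( 2, 4, nevt ) );
  // different colours of the same event are nevt elements apart
  assert( toyJamp2Index( 3, 0, nevt ) - toyJamp2Index( 2, 0, nevt ) == nevt );
  return 0;
}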
@@ -261,14 +319,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -292,7 +346,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -306,7 +359,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -317,6 +369,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -559,158 +615,43 @@ namespace mg5amcCpu jamp_sv[5] -= 1. / 12. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_uux_ttxuux()?) 
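The jamp2 running sums stored just above (|jamp|^2 per colour, accumulated over helicities) are what the event-by-event colour choice later consumes: a cumulative sum over colours is built and the first colour whose cumulative fraction exceeds the event's random number is selected, returned in the Fortran convention [1,ncolor]. A minimal host-side sketch of that sampling step, with the icolamp/iconfig masking omitted and all names (toySelectColor, jamp2, rndcol) hypothetical:

#include <array>
#include <cstdio>

// Cumulative-fraction draw over squared colour flows (mask over allowed colours omitted).
int toySelectColor( const std::array<double, 6>& jamp2, double rndcol )
{
  std::array<double, 6> targetamp{};
  for( std::size_t icol = 0; icol < jamp2.size(); icol++ )
    targetamp[icol] = ( icol == 0 ? 0. : targetamp[icol - 1] ) + jamp2[icol];
  for( std::size_t icol = 0; icol < jamp2.size(); icol++ )
    if( rndcol < targetamp[icol] / targetamp.back() ) return (int)icol + 1; // Fortran range [1,ncolor]
  return (int)jamp2.size(); // numerical safety net
}

int main()
{
  const std::array<double, 6> jamp2{ 1., 4., 2., 0., 3., 0. };
  std::printf( "selected colour = %d\n", toySelectColor( jamp2, 0.45 ) ); // prints 2
  return 0;
}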
- - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 27, 9, 9, 3, 3, 9 }, - { 9, 27, 3, 9, 9, 3 }, - { 9, 3, 27, 9, 9, 3 }, - { 3, 9, 9, 27, 3, 9 }, - { 3, 9, 9, 3, 27, 9 }, - { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
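The removed comments above describe the #475 optimisation that survives in the new color_sum implementation: because the colour matrix is real and symmetric, the complex quadratic form over the jamps collapses to two real ones (A·M·A + B·M·B with A the real parts and B the imaginary parts), and the symmetric double sum can be folded into "diagonal plus twice the upper triangle". A small self-contained check of that identity (ncolor=2 and the sample amplitudes are arbitrary illustration values; this is not the plugin code):

#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  constexpr int ncolor = 2;
  const double M[ncolor][ncolor] = { { 27, 9 }, { 9, 27 } }; // real, symmetric toy colour matrix
  const std::complex<double> J[ncolor] = { { 1.5, -0.5 }, { 0.25, 2.0 } };
  // Full complex quadratic form: sum_ij conj(J_i) M_ij J_j (imaginary part cancels for symmetric M)
  std::complex<double> full = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      full += std::conj( J[i] ) * M[i][j] * J[j];
  // Triangular real form: diagonal terms once, off-diagonal terms counted twice
  double tri = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = M[i][i] * J[i].real();
    double ztempI = M[i][i] * J[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztempR += 2 * M[i][j] * J[j].real();
      ztempI += 2 * M[i][j] * J[j].imag();
    }
    tri += ztempR * J[i].real() + ztempI * J[i].imag();
  }
  assert( std::abs( full.real() - tri ) < 1e-9 && std::abs( full.imag() ) < 1e-9 );
  return 0;
}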
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -798,7 +739,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -833,6 +778,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -875,6 +824,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -995,8 +948,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -1004,25 +957,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1167,13 +1298,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1185,18 +1310,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1221,93 +1351,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1349,7 +1416,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1372,7 +1439,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1381,21 +1448,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1409,8 +1478,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1426,11 +1497,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1532,14 +1604,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h index de4fd12c37..923ce8ceb8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,6 +79,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 14; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -125,7 +127,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -133,9 +135,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -155,34 +159,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running 
sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f index 639c7207e3..0f6ceae7f0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f index 8fc5eeb386..cc5891ef22 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,7 +140,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) @@ -150,7 +150,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -237,7 +237,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -313,6 +313,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -398,24 +402,24 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -497,51 +501,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/color_sum.cc new file mode 100644 index 0000000000..a1e583992a --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/color_sum.cc @@ -0,0 +1,431 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
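For orientation before the body of the new color_sum.cc below: the reworked GPU sigmaKin above launches one calculate_jamps kernel per good helicity, each on its own stream, runs the colour sum per helicity, synchronises, and only then sums over helicities and draws the helicity/colour choice per event. A rough host-side analogue of that control flow, using std::thread purely as a stand-in for CUDA streams (none of these names belong to the plugin, and the per-helicity values are faked):

#include <cstdio>
#include <numeric>
#include <thread>
#include <vector>

int main()
{
  const int nGoodHel = 4;
  std::vector<double> mePerHel( nGoodHel, 0. );
  std::vector<std::thread> streams;
  // one "stream" per good helicity computes that helicity's contribution independently
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
    streams.emplace_back( [ighel, &mePerHel]() { mePerHel[ighel] = 0.1 * ( ighel + 1 ); } );
  // join before the reduction (cf. the gpuDeviceSynchronize before helicity/colour selection)
  for( auto& t : streams ) t.join();
  const double me = std::accumulate( mePerHel.begin(), mePerHel.end(), 0. );
  std::printf( "ME summed over %d helicities = %f\n", nGoodHel, me );
  return 0;
}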
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 27, 9, 9, 3, 3, 9 }, + { 9, 27, 3, 9, 9, 3 }, + { 9, 3, 27, 9, 9, 3 }, + { 3, 9, 9, 27, 3, 9 }, + { 3, 9, 9, 3, 27, 9 }, + { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. 
+ // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int 
icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt)
+                              nevtN ) );                          // there are nevtN (nhel*nevt) "batches"
+
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    // Convert MEs from float to double
+    for( int ighel = 0; ighel < nhel; ighel++ )
+    {
+      fptype* hAllMEs = ghelAllMEs + ighel * nevt;          // MEs for a single helicity ihel
+      fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel
+      gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 );
+    }
+#endif
+  }
+#endif /* clang-format on */
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_gpu( fptype* ghelAllMEs,           // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
+                 fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+                 const int nGoodHel,           // input: number of good helicities
+                 const int gpublocks,          // input: cuda gpublocks
+                 const int gputhreads )        // input: cuda gputhreads
+  {
+    const int nevt = gpublocks * gputhreads;
+    // CASE 1: KERNEL
+    if( !pBlasHandle )
+    {
+      assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+      // Loop over helicities
+      for( int ighel = 0; ighel < nGoodHel; ighel++ )
+      {
+        fptype* hAllMEs = ghelAllMEs + ighel * nevt;           // MEs for one specific helicity ighel
+        const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel
+        gpuStream_t hStream = ghelStreams[ighel];
+        gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel );
+      }
+    }
+    // CASE 2: BLAS
+    else
+    {
+#ifdef MGONGPU_HAS_NO_BLAS
+      assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas
+#else
+      checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed
+      // Reset the tmp buffer
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) );
+#else
+      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) );
+#endif
+      // Delegate the color sum to BLAS for all good helicities
+      color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads );
+#endif
+    }
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+} // end namespace
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/color_sum.h
new file mode 120000
index 0000000000..24b0157011
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/color_sum.h
@@ -0,0 +1 @@
+../color_sum.h
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/configs.inc
index a45cbe8205..cc114056be 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/configs.inc
+++
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/configs.inc @@ -195,3 +195,5 @@ C Diagram 14 DATA (SPROP(I,-4,14),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/14/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/driver.f index f7f23196eb..5997e65826 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f index 9a6d844439..0070f4afc8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -74,10 +74,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -278,17 +275,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -358,7 +344,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -404,7 +390,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(16) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -447,39 +434,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /2.700000000000000D+01 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,3.000000000000000D+00,9.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 6) /27,18,18,6,6,18/ C 1 T(2,1) T(3,4) T(5,6) - DATA (CF(I, 2),I= 1, 6) /9.000000000000000D+00 - $ ,2.700000000000000D+01,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 7, 11) /27,6,18,18,6/ C 1 T(2,1) T(3,6) T(5,4) - DATA (CF(I, 3),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,2.700000000000000D+01,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 12, 15) /27,18,18,6/ C 1 T(2,4) T(3,1) T(5,6) - DATA (CF(I, 4),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,2.700000000000000D - $ +01,3.000000000000000D+00,9.000000000000000D+00/ + DATA (CF(I),I= 16, 18) /27,6,18/ C 1 T(2,4) T(3,6) T(5,1) - DATA (CF(I, 5),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,2.700000000000000D+01,9.000000000000000D+00/ + DATA (CF(I),I= 19, 20) /27,18/ C 1 T(2,6) T(3,1) T(5,4) - DATA (CF(I, 6),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,2.700000000000000D+01/ + DATA (CF(I),I= 21, 21) /27/ C 1 T(2,6) T(3,4) T(5,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. 
- IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -585,10 +565,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -597,6 +579,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc index bf560d981f..3e1fcb02e3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,9 +103,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -111,10 +114,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -173,43 +173,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -221,7 +277,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -230,14 +285,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
@@ -263,14 +321,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -294,7 +348,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -308,7 +361,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -319,6 +371,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -449,158 +505,43 @@ namespace mg5amcCpu jamp_sv[5] -= 1. / 12. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_uxcx_ttxuxcx()?) 
- - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 27, 9, 9, 3, 3, 9 }, - { 9, 27, 3, 9, 9, 3 }, - { 9, 3, 27, 9, 9, 3 }, - { 3, 9, 9, 27, 3, 9 }, - { 3, 9, 9, 3, 27, 9 }, - { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -688,7 +629,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -723,6 +668,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -765,6 +714,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -885,8 +838,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -894,25 +847,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 
+ //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1057,13 +1188,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1075,18 +1200,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1111,93 +1241,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
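// A minimal raw-CUDA sketch (hypothetical kernel and buffer names; the real
// code goes through the gpu*/gpuLaunchKernelStream abstraction macros and the
// calculate_jamps / color_sum_gpu / add_and_select_hel kernels above) of the
// stream-per-good-helicity pattern: each good helicity works on its own stream
// and on its own nevt-sized slice of a super-buffer, and the cross-helicity
// reduction only starts after a device-wide synchronization.
#include <cuda_runtime.h>
__global__ void perHelicityWork( float* slice ) // stand-in for one helicity's jamp and color-sum work
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  slice[ievt] = 1.f; // dummy per-event result for this helicity
}
__global__ void sumOverHelicities( const float* superBuffer, float* out, int nGoodHel, int nevt )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  float sum = 0;
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) sum += superBuffer[ighel * nevt + ievt];
  out[ievt] = sum; // stand-in for the sum over helicities in add_and_select_hel
}
void streamPerHelicitySketch( int nGoodHel, int gpublocks, int gputhreads )
{
  const int nevt = gpublocks * gputhreads;
  float *superBuffer, *out;
  cudaMalloc( (void**)&superBuffer, nGoodHel * nevt * sizeof( float ) );
  cudaMalloc( (void**)&out, nevt * sizeof( float ) );
  cudaStream_t* streams = new cudaStream_t[nGoodHel];
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) cudaStreamCreate( &streams[ighel] );
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) // one launch per good helicity, each on its own stream
    perHelicityWork<<<gpublocks, gputhreads, 0, streams[ighel]>>>( superBuffer + ighel * nevt );
  cudaDeviceSynchronize(); // wait for all helicity streams before the cross-helicity steps
  sumOverHelicities<<<gpublocks, gputhreads>>>( superBuffer, out, nGoodHel, nevt );
  cudaDeviceSynchronize();
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) cudaStreamDestroy( streams[ighel] );
  delete[] streams;
  cudaFree( superBuffer );
  cudaFree( out );
}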
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1239,7 +1306,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1262,7 +1329,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1271,21 +1338,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1299,8 +1368,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1316,11 +1387,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1422,14 +1494,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h index 13a02cdb83..a2ab984dd2 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -80,6 +81,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 7; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -127,7 +129,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -135,9 +137,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -157,34 +161,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum 
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f index bf9951e502..07b686127b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f index 24b0abb30c..107d8a0051 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -142,7 +142,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF CX1=PDG2PDF(LPP(IB(1)),-4, IB(1),XBK(IB(1)), QSCALE) UX1=PDG2PDF(LPP(IB(1)),-2, IB(1),XBK(IB(1)), QSCALE) @@ -151,7 +151,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -243,7 +243,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -321,6 +321,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -406,20 +410,20 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) CX1(IVEC)=PDG2PDF(LPP(IB(1)),-4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) UX1(IVEC)=PDG2PDF(LPP(IB(1)),-2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) DX1(IVEC)=PDG2PDF(LPP(IB(1)),-1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -513,51 +517,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/color_sum.cc new file mode 100644 index 0000000000..a1e583992a --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/color_sum.cc @@ -0,0 +1,431 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
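// Per event and per helicity, the color sum implemented by this new file is
//   |M|^2 += sum_{i,j} Re( conj(jamp_i) * ( colorMatrix[i][j] / colorDenom[i] ) * jamp_j )
// with a real, symmetric colorMatrix. A minimal scalar reference of that
// quadratic form is sketched below (hypothetical helper name, std::complex
// instead of the SIMD/GPU cxtype types used in this file); the CPU, kernel and
// BLAS variants below are optimized reorderings of this same sum.
#include <complex>
static double colorSumReference( const std::complex<double>* jamp, // input: QCD partial amplitudes [ncolor]
                                 const double* colorMatrix,        // input: real symmetric matrix, flattened [ncolor*ncolor]
                                 const double* colorDenom,         // input: per-row denominators [ncolor] (all 1 for this process)
                                 const int ncolor )                // input: number of leading colors (6 here)
{
  double me2 = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    std::complex<double> ztemp = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ )
      ztemp += colorMatrix[icol * ncolor + jcol] / colorDenom[icol] * jamp[jcol];
    me2 += std::real( std::conj( jamp[icol] ) * ztemp );
  }
  return me2;
}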
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 27, 9, 9, 3, 3, 9 }, + { 9, 27, 3, 9, 9, 3 }, + { 9, 3, 27, 9, 9, 3 }, + { 3, 9, 9, 27, 3, 9 }, + { 3, 9, 9, 3, 27, 9 }, + { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. 
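// The rewrite described in the comment above uses two identities: for a real
// matrix M the hermitian quadratic form conj(A+iB)^T M (A+iB) reduces to
// A^T M A + B^T M B (the mixed terms are purely imaginary and drop out of the
// real part), and when the row-normalized matrix M[i][j]/d[i] is symmetric (as
// it is for these color matrices) the double sum can be folded onto the upper
// triangle with a factor 2 on the off-diagonal entries. A minimal scalar
// sketch of the folded form (hypothetical names; the "2*" and "/denom" factors
// are pre-applied, as in the constexpr cf2 above):
static double quadFormUpperTriangle( const double* re,  // input: Re(jamp) [ncolor]
                                     const double* im,  // input: Im(jamp) [ncolor]
                                     const double* cf2, // input: flattened [ncolor*ncolor]; cf2[i*n+i]=M[i][i]/d[i], cf2[i*n+j] (j>i) = 2*M[i][j]/d[i]
                                     const int ncolor )
{
  double me2 = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    double ztempR = cf2[icol * ncolor + icol] * re[icol];
    double ztempI = cf2[icol * ncolor + icol] * im[icol];
    for( int jcol = icol + 1; jcol < ncolor; jcol++ ) // upper triangle only
    {
      ztempR += cf2[icol * ncolor + jcol] * re[jcol];
      ztempI += cf2[icol * ncolor + jcol] * im[jcol];
    }
    me2 += re[icol] * ztempR + im[icol] * ztempI; // equals the full symmetric double sum
  }
  return me2;
}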
+ // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int 
icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/configs.inc index e2a9d0c352..bc8dbca9d7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/configs.inc +++ 
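// For reference, the two BLAS steps in color_sum_blas above written out as
// plain loops for a single (real or imaginary) jamp component (hypothetical
// helper name; the production code offloads step 1 to one gemm covering all
// helicities and events at once, and step 2 to a gemmStridedBatched with a 1x1
// result per event, i.e. a batched dot product, using beta=1 so that the real
// and imaginary contributions accumulate into the same MEs buffer).
static void blasColorSumReference( const double* normColMat, // input: normalized color matrix, flattened [ncolor*ncolor]
                                   const double* jampsComp,  // input: one jamp component, [ncolor][nhel*nevt] (column = one helicity*event)
                                   double* ztemp,            // tmp: [ncolor][nhel*nevt]
                                   double* mes,              // in/out: [nhel*nevt], += contribution of this component
                                   const int ncolor,
                                   const int nhelnevt )      // nhel*nevt
{
  // Step 1 (the gemm): ztemp = normColMat * jampsComp
  for( int icol = 0; icol < ncolor; icol++ )
    for( int j = 0; j < nhelnevt; j++ )
    {
      double sum = 0;
      for( int kcol = 0; kcol < ncolor; kcol++ )
        sum += normColMat[icol * ncolor + kcol] * jampsComp[kcol * nhelnevt + j];
      ztemp[icol * nhelnevt + j] = sum;
    }
  // Step 2 (the gemmStridedBatched): mes[j] += dot( jampsComp(:,j), ztemp(:,j) ) for each helicity*event j
  for( int j = 0; j < nhelnevt; j++ )
    for( int icol = 0; icol < ncolor; icol++ )
      mes[j] += jampsComp[icol * nhelnevt + j] * ztemp[icol * nhelnevt + j];
}
// Called once with the real components and once with the imaginary components,
// this reproduces the same |M|^2 sums as the kernel path, up to the memory
// layout details handled by the cuBLAS leading-dimension and stride arguments.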
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/configs.inc @@ -105,3 +105,5 @@ C Diagram 7 DATA (SPROP(I,-4,7),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/7/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/driver.f index f7f23196eb..5997e65826 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f index 2a76dfeffb..3be02200e4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -76,10 +76,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -280,17 +277,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -360,7 +346,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -408,7 +394,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(8) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -451,39 +438,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /2.700000000000000D+01 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,3.000000000000000D+00,9.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 6) /27,18,18,6,6,18/ C 1 T(1,4) T(2,5) T(3,6) - DATA (CF(I, 2),I= 1, 6) /9.000000000000000D+00 - $ ,2.700000000000000D+01,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 7, 11) /27,6,18,18,6/ C 1 T(1,4) T(2,6) T(3,5) - DATA (CF(I, 3),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,2.700000000000000D+01,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 12, 15) /27,18,18,6/ C 1 T(1,5) T(2,4) T(3,6) - DATA (CF(I, 4),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,2.700000000000000D - $ +01,3.000000000000000D+00,9.000000000000000D+00/ + DATA (CF(I),I= 16, 18) /27,6,18/ C 1 T(1,5) T(2,6) T(3,4) - DATA (CF(I, 5),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,2.700000000000000D+01,9.000000000000000D+00/ + DATA (CF(I),I= 19, 20) /27,18/ C 1 T(1,6) T(2,4) T(3,5) - DATA (CF(I, 6),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,2.700000000000000D+01/ + DATA (CF(I),I= 21, 21) /27/ C 1 T(1,6) T(2,5) T(3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. 
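// C++ sketch (hypothetical helper name) of the packed storage used by the new
// integer CF array above: only the upper triangle is kept, stored row by row,
// with the off-diagonal entries pre-doubled (18 = 2*9, 6 = 2*3) so that the
// inner loop can start at J = I, and with a single integer DENOM divided out
// of MATRIX1 once at the end instead of being baked into floating-point DATA
// statements.
static int packedUpperTriangleIndex( const int i, const int j, const int ncolor ) // 0-based indices, requires j >= i
{
  // Rows 0..i-1 hold ncolor + (ncolor-1) + ... + (ncolor-i+1) entries in total.
  const int rowStart = i * ncolor - i * ( i - 1 ) / 2;
  return rowStart + ( j - i );
}
// For ncolor = 6 this maps (0,0)..(0,5) to 0..5, (1,1)..(1,5) to 6..10, etc.,
// matching the index ranges of the DATA statements above (shifted by one for
// Fortran's 1-based CF array).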
- IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -553,10 +533,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -565,6 +547,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc index 83faf9192b..26b682be00 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,9 +101,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -109,10 +112,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,43 +171,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -219,7 +275,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -228,14 +283,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
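A minimal host-side sketch (example sizes only, not generated code) of the buffer layouts assumed here: DeviceAccessJamp2 strides the per-color jamp2 sums event-major as buffer[icol * nevt + ievt], while the per-helicity jamp super-buffers use an [ix2][icol][ihel][ievt] striding (ix2 = 0,1 for the real and imaginary parts), consistent with convertD2F_Jamps further below. The check shows why offsetting the super-buffer by ighel * nevt yields a valid single-helicity view accessed with ihel0 = 0.

  #include <cassert>
  #include <cstddef>

  // Flat index for a jamp super-buffer laid out as [ix2][icol][ihel][ievt]
  // (ncolor = 6 for this P2_uxux_ttxuxux process)
  constexpr std::size_t jampIndex( int ix2, int icol, int ihel, int ievt, int nhel, int nevt )
  {
    return ( ( static_cast<std::size_t>( ix2 ) * 6 + icol ) * nhel + ihel ) * nevt + ievt;
  }

  int main()
  {
    const int nhel = 4, nevt = 32;           // example values only
    const int ighel = 2, icol = 5, ievt = 7; // example values only
    // "ghelAllJamps + ighel * nevt" plus an ( icol, ihel0 = 0 ) access reaches the same
    // element as indexing the full super-buffer at ( icol, ighel )
    assert( ighel * nevt + jampIndex( 0, icol, 0, ievt, nhel, nevt ) == jampIndex( 0, icol, ighel, ievt, nhel, nevt ) );
    return 0;
  }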
@@ -261,14 +319,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -292,7 +346,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -306,7 +359,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -317,6 +369,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -559,158 +615,43 @@ namespace mg5amcCpu jamp_sv[5] -= 1. / 12. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_uxux_ttxuxux()?) 
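To spell out what the color matrix encodes, here is a minimal scalar sketch of the per-event quadratic form |M|^2 += J^dagger (cf/denom) J over the ncolor = 6 color flows, using plain std::complex instead of the cxtype_sv/fptype_sv vector types; the cf and denom values are copied from the constexpr arrays listed just below (and again in the new color_sum.cc).

  #include <array>
  #include <complex>

  // Scalar version of the color sum for one event (not the generated SIMD/GPU code)
  double colorSumScalar( const std::array<std::complex<double>, 6>& jamp )
  {
    constexpr double denom[6] = { 1, 1, 1, 1, 1, 1 };
    constexpr double cf[6][6] = { { 27, 9, 9, 3, 3, 9 },
                                  { 9, 27, 3, 9, 9, 3 },
                                  { 9, 3, 27, 9, 9, 3 },
                                  { 3, 9, 9, 27, 3, 9 },
                                  { 3, 9, 9, 3, 27, 9 },
                                  { 9, 3, 3, 9, 9, 27 } };
    double me = 0;
    for( int icol = 0; icol < 6; icol++ )
    {
      std::complex<double> ztemp = 0;
      for( int jcol = 0; jcol < 6; jcol++ ) ztemp += cf[icol][jcol] * jamp[jcol]; // ztemp = (cf * J)[icol]
      me += ( ztemp * std::conj( jamp[icol] ) ).real() / denom[icol];             // me += Re( J[icol]^* ztemp ) / denom[icol]
    }
    return me;
  }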
- - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 27, 9, 9, 3, 3, 9 }, - { 9, 27, 3, 9, 9, 3 }, - { 9, 3, 27, 9, 9, 3 }, - { 3, 9, 9, 27, 3, 9 }, - { 3, 9, 9, 3, 27, 9 }, - { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
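The comment above is the rationale for the folded form used in the C++ path (TriangularNormalizedColorMatrix here, cf2 in the new color_sum.cc): for a real symmetric cf, J^dagger (cf/denom) J = Re(J)^T M Re(J) + Im(J)^T M Im(J), and each off-diagonal term can be counted once with a factor 2. A scalar sketch of that rewrite (not the SIMD implementation; with the unit denominators of this process it reproduces the full quadratic form exactly):

  #include <complex>

  // Folded upper-triangle color sum for one event (scalar sketch of the #475 rewrite)
  double colorSumTriangular( const double cf[6][6], const double denom[6], const std::complex<double> jamp[6] )
  {
    double me = 0;
    for( int icol = 0; icol < 6; icol++ )
    {
      // Diagonal term, normalized by denom[icol] (precomputed at compile time in cf2)
      double ztempR = cf[icol][icol] / denom[icol] * jamp[icol].real();
      double ztempI = cf[icol][icol] / denom[icol] * jamp[icol].imag();
      // Off-diagonal terms of the upper triangle carry the factor 2 from symmetry
      for( int jcol = icol + 1; jcol < 6; jcol++ )
      {
        ztempR += 2 * cf[icol][jcol] / denom[icol] * jamp[jcol].real();
        ztempI += 2 * cf[icol][jcol] / denom[icol] * jamp[jcol].imag();
      }
      me += jamp[icol].real() * ztempR + jamp[icol].imag() * ztempI; // AMA + BMB, no cross terms
    }
    return me;
  }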
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -798,7 +739,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -833,6 +778,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -875,6 +824,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -995,8 +948,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -1004,25 +957,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1167,13 +1298,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 72 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1185,18 +1310,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1221,93 +1351,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1349,7 +1416,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1372,7 +1439,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1381,21 +1448,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1409,8 +1478,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1426,11 +1497,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1532,14 +1604,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h index 0b67fca178..5623c32c4f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,6 +79,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 14; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -125,7 +127,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -133,9 +135,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -155,34 +159,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running 
sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f index f8d2319067..ea3c698850 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f index f9adb0c2a2..5518a456a6 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,7 +140,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF CX1=PDG2PDF(LPP(IB(1)),-4, IB(1),XBK(IB(1)), QSCALE) SX1=PDG2PDF(LPP(IB(1)),-3, IB(1),XBK(IB(1)), QSCALE) @@ -150,7 +150,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -237,7 +237,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -313,6 +313,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -398,24 +402,24 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) CX1(IVEC)=PDG2PDF(LPP(IB(1)),-4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) SX1(IVEC)=PDG2PDF(LPP(IB(1)),-3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) UX1(IVEC)=PDG2PDF(LPP(IB(1)),-2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) DX1(IVEC)=PDG2PDF(LPP(IB(1)),-1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -497,51 +501,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/color_sum.cc new file mode 100644 index 0000000000..a1e583992a --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/color_sum.cc @@ -0,0 +1,431 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 27, 9, 9, 3, 3, 9 }, + { 9, 27, 3, 9, 9, 3 }, + { 9, 3, 27, 9, 9, 3 }, + { 3, 9, 9, 27, 3, 9 }, + { 3, 9, 9, 3, 27, 9 }, + { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. 
+ // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int 
icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/configs.inc index 9841fb23df..6a4d7d209f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/configs.inc +++ 
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/configs.inc @@ -210,3 +210,5 @@ C Diagram 14 DATA (SPROP(I,-4,14),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/14/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/driver.f index f7f23196eb..5997e65826 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f index 35761964e7..5a3d10c673 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -74,10 +74,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -278,17 +275,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -358,7 +344,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -404,7 +390,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(16) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -447,39 +434,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /2.700000000000000D+01 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,3.000000000000000D+00,9.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 6) /27,18,18,6,6,18/ C 1 T(1,4) T(2,5) T(3,6) - DATA (CF(I, 2),I= 1, 6) /9.000000000000000D+00 - $ ,2.700000000000000D+01,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 7, 11) /27,6,18,18,6/ C 1 T(1,4) T(2,6) T(3,5) - DATA (CF(I, 3),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,2.700000000000000D+01,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 12, 15) /27,18,18,6/ C 1 T(1,5) T(2,4) T(3,6) - DATA (CF(I, 4),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,2.700000000000000D - $ +01,3.000000000000000D+00,9.000000000000000D+00/ + DATA (CF(I),I= 16, 18) /27,6,18/ C 1 T(1,5) T(2,6) T(3,4) - DATA (CF(I, 5),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,2.700000000000000D+01,9.000000000000000D+00/ + DATA (CF(I),I= 19, 20) /27,18/ C 1 T(1,6) T(2,4) T(3,5) - DATA (CF(I, 6),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,2.700000000000000D+01/ + DATA (CF(I),I= 21, 21) /27/ C 1 T(1,6) T(2,5) T(3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. 
- IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -585,10 +565,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -597,6 +579,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/addmothers.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/addmothers.f index 9a31ed201d..d6cded9a2d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else @@ -220,7 +221,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, ncolmp=0 endif if(mo_color.gt.1.and. - $ mo_color.ne.3.and.mo_color.ne.8)then + $ mo_color.ne.3.and.mo_color.ne.8.and.mo_color.ne.6)then da_color(1)=get_color(jpart(1,ida(1))) da_color(2)=get_color(jpart(1,ida(2))) call write_error(da_color(1), da_color(2), mo_color) @@ -326,8 +327,8 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif endif !end of check on LC -c Just zero helicity info for intermediate states - jpart(7,i) = 0 +c Just No helicity info for intermediate states + jpart(7,i) = 9 enddo ! 
do i 100 continue if (is_LC) call check_pure_internal_flow(icolalt,jpart, maxcolor) @@ -586,13 +587,13 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, i3=i3+1 c color for t-channels needs to be reversed if(i3.eq.1) icol(2,ires)=icolmp(1,i) - if(i3.eq.2) icol(1,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 c color for t-channels needs to be reversed if(i3bar.eq.1) icol(1,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(2,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(2,i) endif enddo @@ -764,6 +765,14 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, endif endif c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) + elseif(mo_color.eq.6.and.i3.eq.0.and.i3bar.eq.2)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue + elseif(mo_color.eq.6.and.i3.eq.2.and.i3bar.eq.0)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue else c Don't know how to deal with this call write_error(i3,i3bar,mo_color) @@ -814,12 +823,12 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(icolmp(1,i).gt.0)then i3=i3+1 if(i3.eq.1) icol(1,ires)=icolmp(1,i) - if(i3.eq.2) icol(2,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 if(i3bar.eq.1) icol(2,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(1,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(2,i) endif enddo @@ -830,23 +839,33 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(n3.le.1.and.n3bar.eq.0) icol(2,ires)=0 if(i3.ne.n3.or.i3bar.ne.n3bar) then - if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.0.and.i3.eq.0)then + if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.i3)then c This is an epsilon index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(1,ires)=maxcolor + if(i3.eq.0) then + maxcolor=maxcolor+1 + icol(1,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(2,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(2,ires)=-maxcolor endif - elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.0.and.i3bar.eq.0)then + elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.i3bar)then c This is an epsilonbar index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(2,ires)=maxcolor + if(i3bar.eq.0)then + maxcolor=maxcolor+1 + icol(2,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(1,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(1,ires)=-maxcolor endif elseif(n3.gt.0.and.n3bar.eq.0.and.i3-i3bar.eq.n3.or. $ n3bar.gt.0.and.n3.eq.0.and.i3bar-i3.eq.n3bar.or. 
@@ -961,6 +980,12 @@ subroutine fix_s_color_indices(n3,n3bar,i3,i3bar,ncolmp,icolmp, if(n3.eq.1) icol(1,ires)=max_n3 if(n3bar.eq.1) icol(2,ires)=min_n3bar endif + do i=ires,-1 + if (icol(1,i).eq.maxcol) icol(1,i)=mincol + if (icol(1,i).eq.-maxcol) icol(1,i)=-mincol + if (icol(2,i).eq.maxcol) icol(2,i)=mincol + if (icol(2,i).eq.-maxcol) icol(2,i)=-mincol + enddo c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) endif else diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cluster.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = 
gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
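+      # (fall back to a noBLAS build, i.e. compile with -DMGONGPU_HAS_NO_BLAS as set further below)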
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
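+# (DSIG groups driver.o with the auto_dsig*.o objects for the Fortran-only madevent build;
+#  DSIG_cudacpp groups driver_cudacpp.o with the *_cudacpp.o variants of the same sources,
+#  which are compiled with -DMG5AMC_MEEXPORTER_CUDACPP via the %_cudacpp.o rule below.)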
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cuts.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. + return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. 
Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. + +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/genps.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/genps.f index 1c32e93f5d..5449ab9e30 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/genps.f @@ -124,7 +124,8 @@ subroutine gen_mom(iconfig,mincfig,maxcfig,invar,wgt,x,p1) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer fake_id + common/to_sprop/sprop,tprid,fake_id logical firsttime double precision xprop(3,nexternal),tprop(3,nexternal) @@ -1373,6 +1374,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1389,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not. 
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1637,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1657,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1818,8 +1834,8 @@ double precision function get_channel_cut(p, config) common/to_forest/ iforest, tstrategy integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) - integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer tprid(-max_branch:-1,lmaxconfigs), fake_id + common/to_sprop/sprop,tprid,fake_id double precision stot,m1,m2 common/to_stot/stot,m1,m2 @@ -1915,7 +1931,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1946,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile deleted file mode 100644 index 49e6800fff..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile +++ /dev/null @@ -1,327 +0,0 @@ -SHELL := /bin/bash - -include ../../Source/make_opts - -# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) -# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing -include ../../src/cudacpp_config.mk -ifeq ($(CUDACPP_BUILDDIR),) -$(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) -endif - -# Disable all Fortran warnings? 
-FFLAGS+= -w - -# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html -FFLAGS+= -cpp - -# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) -CXXFLAGS = -O3 -Wall -Wshadow -Wextra - -# Add -std=c++17 explicitly to avoid build errors on macOS -# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" -ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 -endif - -# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) -ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) - override CXX:=ccache $(CXX) -endif -###ifeq ($(USECCACHE)$(shell echo $(FC) | grep ccache),1) -### override FC:=ccache $(FC) -###endif - -# Load additional dependencies of the bias module, if present -ifeq (,$(wildcard ../bias_dependencies)) -BIASDEPENDENCIES = -else -include ../bias_dependencies -endif - -# Definitions - -LIBDIR = ../../lib/ -BINDIR = ../../bin/ -PROG = madevent - -ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") - include ../MadLoop_makefile_definitions -else - LINK_LOOP_LIBS = - LOOP_LIBS = - LOOP_INCLUDE = - LINK_MADLOOP_LIB = - MADLOOP_LIB = -endif - -LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias - -CUDACPP_MAKEFILE=cudacpp.mk -processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') -ifeq ($(BACKEND),cuda) -CUDACPP_COMMONLIB=mg5amc_common_cuda -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda -else ifeq ($(BACKEND),hip) -CUDACPP_COMMONLIB=mg5amc_common_hip -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip -else -CUDACPP_COMMONLIB=mg5amc_common_cpp -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp -endif - -LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) - -ifneq ("$(wildcard ../../Source/RUNNING)","") - LINKLIBS += -lrunning - LIBS += $(LIBDIR)librunning.$(libext) -endif - - -# Source files - -MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) -MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) -ifeq ($(strip $(MATRIX_HEL)),) - MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) -endif - - -PROCESS= myamp.o genps.o unwgt.o setcuts.o get_color.o \ - cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ - idenparts.o dummy_fct.o - -DSIG=driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) -DSIG_cudacpp=driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) - -SYMMETRY = symmetry.o idenparts.o - -# Binaries - -ifeq ($(UNAME),Darwin) -LDFLAGS += -lc++ # avoid 'Undefined symbols' for chrono::steady_clock on macOS (checked with otool -L libmg5amc_gg_ttx_cpp.so) -LDFLAGS += -mmacosx-version-min=11.3 # avoid "ld: warning: object file was built for newer macOS version than being linked" -else -LDFLAGS += -Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 (not supported on macOS) -endif - -# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) -.DEFAULT_GOAL := all - -ifeq ($(BACKEND),cuda) -all: $(PROG)_fortran 
$(CUDACPP_BUILDDIR)/$(PROG)_cuda -else ifeq ($(BACKEND),hip) -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip -else -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp -endif - -# Disable OpenMP by default: enable OpenMP only if USEOPENMP=1 (#758) -ifeq ($(USEOPENMP),1) -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -override OMPFLAGS = -fopenmp -LINKLIBS += -liomp5 # see #578 -LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -override OMPFLAGS = -fopenmp -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 -else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang -else -override OMPFLAGS = -fopenmp -endif -endif - -$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o - $(FC) -o $(PROG)_fortran $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) - -$(LIBS): .libs - -.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat - cd ../../Source; make - touch $@ - -$(CUDACPP_BUILDDIR)/.cudacpplibs: - $(MAKE) -f $(CUDACPP_MAKEFILE) - touch $@ - -# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH -# Use relative paths with respect to the executables ($ORIGIN on Linux) -# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary -ifeq ($(UNAME_S),Darwin) - override LIBFLAGSRPATH = -else ifeq ($(USEBUILDDIR),1) - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' -else - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' -endif - -.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link - -madevent_fortran_link: $(PROG)_fortran - rm -f $(PROG) - ln -s $(PROG)_fortran $(PROG) - -madevent_cuda_link: - $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) - -madevent_hip_link: - $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) - -madevent_cpp_link: - $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -override SUPPORTED_AVXS = cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto -madevent_%_link: - @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then echo "ERROR! 
Invalid target '$@' (supported madevent_cpp*_link targets are: $(foreach avx,$(SUPPORTED_AVXS),'madevent_cpp$(avx)_link'))"; exit 1; fi - $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_cuda now uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_hip also uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -counters.o: counters.cc timer.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -ompnumthreads.o: ompnumthreads.cc ompnumthreads.h - $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ - -$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) - $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) - -gensym: $(SYMMETRY) configs.inc $(LIBS) - $(FC) -o gensym $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) - -###ifeq (,$(wildcard fbridge.inc)) # Pointless: fbridge.inc always exists as this is the cudacpp-modified makefile! -###$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat -### cd ../../Source/MODEL; make -### -###$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat -### cd ../../Source; make -### -###$(LIBDIR)libpdf.$(libext): -### cd ../../Source/PDF; make -### -###$(LIBDIR)libgammaUPC.$(libext): -### cd ../../Source/PDF/gammaUPC; make -###endif - -# Add source so that the compiler finds the DiscreteSampler module. 
-$(MATRIX): %.o: %.f - $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%.o: %.f - $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%_cudacpp.o: %.f - $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ - -# Dependencies - -driver.f: genps.inc -symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc -genps.o: genps.inc nexternal.inc configs.inc -dummy_fct.0: run.inc genps.inc -cuts.o: genps.inc nexternal.inc pmass.inc -setcuts.o: genps.inc run_config.inc -invarients.o: genps.inc nexternal.inc -myamp.o: props.inc genps.inc nexternal.inc -reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ - run_config.inc -cluster.o: cluster.inc genps.inc nexternal.inc message.inc -addmothers.o: genps.inc nexternal.inc symswap.inc message.inc -unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ - run_config.inc -initcluster.o: message.inc - -# Extra dependencies on discretesampler.mod - -auto_dsig.o: .libs -driver.o: .libs -driver_cudacpp.o: .libs -$(MATRIX): .libs -genps.o: .libs - -# Cudacpp bldall targets - -ifeq ($(UNAME_P),ppc64le) -bldavxs: bldnone bldsse4 -else ifeq ($(UNAME_P),arm) -bldavxs: bldnone bldsse4 -else -bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z -endif - -ifneq ($(shell which hipcc 2>/dev/null),) -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldhip bldcuda bldavxs -else -bldall: bldhip bldavxs -endif -else -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldcuda bldavxs -else -bldall: bldavxs -endif -endif - -bldcuda: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cuda - -bldhip: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=hip - -bldnone: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppnone - -bldsse4: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 - -bldavx2: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 - -bld512y: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y - -bld512z: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z - -# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) - -clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn - $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(CUDACPP_BUILDDIR)/$(PROG)_hip - -cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src - $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall - rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs - rm -f .libs - -cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src - make -C ../../Source cleanall - rm -rf $(LIBDIR)libbias.$(libext) - rm -f ../../Source/*.mod ../../Source/*/*.mod - -distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation - $(MAKE) -f $(CUDACPP_MAKEFILE) distclean diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile new file mode 120000 index 0000000000..9fba275947 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile @@ -0,0 +1 @@ +makefile_wrapper.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile_original.mk 
new file mode 100644 index 0000000000..348c283be7 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile_original.mk @@ -0,0 +1,101 @@ +include ../../Source/make_opts +FFLAGS+= -w + +# Load additional dependencies of the bias module, if present +ifeq (,$(wildcard ../bias_dependencies)) +BIASDEPENDENCIES = +else +include ../bias_dependencies +endif + +# Definitions + +LIBDIR = ../../lib/ +BINDIR = ../../bin/ +PROG = madevent + +ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") + include ../MadLoop_makefile_definitions +else + LINK_LOOP_LIBS = + LOOP_LIBS = + LOOP_INCLUDE = + LINK_MADLOOP_LIB = + MADLOOP_LIB = +endif + +LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L../../lib/ -ldhelas -ldsample -lmodel -lgeneric -lpdf -lgammaUPC -lcernlib $(llhapdf) -lbias + +LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) + +ifneq ("$(wildcard ../../Source/RUNNING)","") + LINKLIBS += -lrunning + LIBS += $(LIBDIR)librunning.$(libext) +endif + + +# Source files + +MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) +MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) +ifeq ($(strip $(MATRIX_HEL)),) + MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) +endif + + +PROCESS= driver.o myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o \ + $(patsubst %.f,%.o,$(wildcard auto_dsig*.f)) \ + +SYMMETRY = symmetry.o idenparts.o + +# Binaries + +$(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) + $(FC) -o $(PROG) $(PROCESS) $(MATRIX) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) + +$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat + cd ../../Source/MODEL; make + +$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat + cd ../../Source; make + +$(LIBDIR)libpdf.$(libext): + cd ../../Source/PDF; make + +$(LIBDIR)libgammaUPC.$(libext): + cd ../../Source/PDF/gammaUPC; make + +# Add source so that the compiler finds the DiscreteSampler module. 
+$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +# Dependencies + +driver.f: genps.inc +symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc +genps.o: genps.inc nexternal.inc configs.inc +dummy_fct.0: run.inc genps.inc +cuts.o: genps.inc nexternal.inc pmass.inc +setcuts.o: genps.inc run_config.inc +invarients.o: genps.inc nexternal.inc +myamp.o: props.inc genps.inc nexternal.inc +reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ + run_config.inc +cluster.o: cluster.inc genps.inc nexternal.inc message.inc +addmothers.o: genps.inc nexternal.inc symswap.inc message.inc +unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ + run_config.inc +initcluster.o: message.inc + +clean: + $(RM) *.o gensym madevent madevent_forhel diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/myamp.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min with dsqrt_shatmax**2 with the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/reweight.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/reweight.f index 0a0bafa7c1..9d8fe1c4f0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/reweight.f @@ -976,9 +976,9 @@ logical function setclscales(p, keepq2bck, ivec) $ ' and jcentral is ',jcentral(1),jcentral(2) if (btest(mlevel,3)) then - write(*,'(a$)') 'QCD jets (final): ' + write(*,'(a,$)') 'QCD jets (final): ' do i=3,nexternal - if(iqjets(i).gt.0) write(*,'(i3$)') i + if(iqjets(i).gt.0) write(*,'(i3,$)') i enddo write(*,*) endif @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) @@ -1387,7 +1387,9 @@ double precision function rewgt(p, ivec) integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - include 'configs.inc' + integer fake_id + common/to_sprop/sprop,tprid,fake_id +c include 'configs.inc' real*8 xptj,xptb,xpta,xptl,xmtc real*8 xetamin,xqcut,deltaeta common /to_specxpt/xptj,xptb,xpta,xptl,xmtc,xetamin,xqcut,deltaeta @@ -1588,6 +1590,8 @@ double precision function rewgt(p, ivec) $ ipdgcl(1,igraphs(1),iproc),ipart,.false.).and. $ (goodjet(idacl(n,1)).or.goodjet(idacl(n,2)))) then c alpha_s weight + + if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1600,6 +1604,7 @@ double precision function rewgt(p, ivec) write(*,*)' as: ',alphas(alpsfact*dsqrt(q2now)), & '/',asref,' -> ',alphas(alpsfact*dsqrt(q2now))/asref write(*,*)' and G=',SQRT(4d0*PI*ALPHAS(scale)) + endif endif endif endif diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/symmetry.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/symmetry.f index 309540a0a2..d0706e90b4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/symmetry.f @@ -51,6 +51,7 @@ program symmetry integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) + integer fake_id include 'configs.inc' data use_config/0,lmaxconfigs*0/ @@ -232,7 +233,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +243,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +261,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/unwgt.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/pp_tt012j.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/pp_tt012j.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py index 42d82818d0..2bc6174b85 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3306,7 +3353,7 @@ def edit_dummy_fct_from_file(self, filelist, outdir): def retro_compatible_custom_fct(lines, mode=None): f77_type = ['real*8', 'integer', 'double precision', 'logical'] - function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ + function_pat = re.compile(r'^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ % {'type':'|'.join(f77_type)}, re.I+re.M) include_pat = re.compile(r"\s+include\s+[\'\"]([\w\./]*)") @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - 
logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/check_param_card.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py index 9ff7390cf5..8de498fcc2 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. 
Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. (beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. 
Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" @@ -6868,8 +6890,9 @@ def handle_alarm(signum, frame): else: log_level=20 - - if run_card: + if run_card and (run_card['lpp1'] !=0 or run_card['lpp2'] !=0): + # They are likely case like lpp=+-3, where alpas not need reset + # but those have dedicated name of pdf avoid the reset as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, 'cteq6_l': 0.118, diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/file_writers.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/files.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/files.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if 
log: - logger.warning(why) + logger.warning("fail to cp", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "

To save bandwidth not all diagrams were converted to PNG."; print PAGE "

To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 'nprocs' in 
opts: + self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/histograms.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/histograms.py index 51ae2914fc..0883cd9613 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the 
variation. + if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2405,10 +2404,10 @@ def output(self, path, format='gnuplot',number_of_ratios = -1, gnuplot_output_list=gnuplot_output_list_v5 else: output, _ = p.communicate() - output.decode(errors='ignore') + output = output.decode(errors='ignore') if not output: gnuplot_output_list=gnuplot_output_list_v5 - elif float(output.split()[1]) < 5. : + elif int(output.split()[1].split('.')[0]) < 5 : gnuplot_output_list=gnuplot_output_list_v4 else: gnuplot_output_list=gnuplot_output_list_v5 @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. 
Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. 
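The histograms.py hunk above fixes two details of the gnuplot handling: the decoded `--version` output is now actually assigned (`output = output.decode(errors='ignore')` instead of a discarded call), and the major version is compared as an integer rather than coercing the whole token to float. A minimal standalone sketch of that check, assuming a `gnuplot` executable on PATH; the helper name `gnuplot_major_version` and the default-to-v5 fallback are illustrative, not part of the patch:

    import subprocess

    def gnuplot_major_version(exe='gnuplot'):
        """Return gnuplot's major version as an int, or None if it cannot be determined.

        The banner looks like 'gnuplot 5.4 patchlevel 8', so token [1] is '5.4'
        and the part before the first dot is the major version.
        """
        try:
            proc = subprocess.Popen([exe, '--version'],
                                    stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        except OSError:
            return None
        output, _ = proc.communicate()
        output = output.decode(errors='ignore')  # decode() returns a new str; it must be re-assigned
        if not output:
            return None
        try:
            return int(output.split()[1].split('.')[0])
        except (IndexError, ValueError):
            return None

    # pick the gnuplot-4 templates only when the major version is genuinely below 5
    use_v4_templates = (gnuplot_major_version() or 5) < 5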
import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/lhe_parser.py index f6e47956cd..d4b94bab10 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -1792,7 +1794,10 @@ def add_decays(self, pdg_to_decay): if particle.pdg in pdg_to_decay and pdg_to_decay[particle.pdg]: one_decay = pdg_to_decay[particle.pdg].pop() self.add_decay_to_particle(i, one_decay) + particle.helicity = 9 return 
self.add_decays(pdg_to_decay) + + return self @@ -2166,10 +2171,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2961,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..dea35930ea 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz 
%(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3667,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_comine_iteration(self, line): + def do_combine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
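The `split(a, n)` helper shown earlier in this madevent_interface.py diff (used to chunk the G directories for partial unweighting) slices a list into n contiguous, nearly equal pieces, with the first `len(a) % n` chunks one element longer. A self-contained sketch of the same logic; the demo values are only an illustration:

    def split(a, n):
        """Yield n contiguous chunks of a, as evenly sized as possible."""
        k, m = divmod(len(a), n)
        return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))

    # 10 G-directories over 3 chunks -> sizes 4, 3, 3
    chunks = list(split(['G%d' % i for i in range(10)], 3))
    assert [len(c) for c in chunks] == [4, 3, 3]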
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
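The do_pythia8 hunk above now drives Pythia8 directly through its bundled `main164` example by default, and only falls back to the legacy MG5aMC_PY8_interface when `--old_interface` is requested or `main164` cannot be found. A minimal sketch of that lookup, assuming only the two candidate locations tried above; the function name `find_pythia8_main` is illustrative:

    import os

    def find_pythia8_main(pythia8_path):
        """Locate the bundled Pythia8 'main164' driver.

        Tries the same two locations as the do_pythia8 hunk above and returns
        None if neither exists, so the caller can fall back to the legacy
        MG5aMC_PY8_interface (the '--old_interface' route).
        """
        candidates = (
            os.path.join(pythia8_path, 'share', 'Pythia8', 'examples', 'main164'),
            os.path.join(pythia8_path, 'examples', 'main164'),
        )
        for exe in candidates:
            if os.path.exists(exe):
                return exe
        return None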
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
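The per-split card adjustment above mirrors a simple rule: each split LHE file only carries its own share of the events, so `Main:numberOfEvents` is set to that share and `HEPMCoutput:scaling` is multiplied by it to keep the weight normalization consistent. A sketch of just that arithmetic; the exact meaning of `HEPMCoutput:scaling` is Pythia8's, and `base_scaling`/`partition` stand in for the values read from the parallelization PY8Card and the LHE splitting step:

    def per_split_py8_overrides(base_scaling, partition):
        """Per-split overrides mirroring the loop above.

        Split i carries partition[i] events, so Main:numberOfEvents is set to
        that count and HEPMCoutput:scaling is the base value multiplied by it.
        """
        return [
            {'Main:numberOfEvents': n_i,
             'HEPMCoutput:scaling': base_scaling * float(n_i)}
            for n_i in partition
        ]

    # e.g. 1000 events split into three LHE files of 334/333/333 events
    overrides = per_split_py8_overrides(1.0, [334, 333, 333])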
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -5596,6 +5635,19 @@ def do_plot(self, line): else: logger.info('No valid files for delphes plot') + def do_compile(self, line): + """compile the current directory """ + + args = self.split_arg(line) + self.ask_run_configuration(mode='parton') + self.run_card = banner_mod.RunCard(pjoin(self.me_dir, 'Cards', 'run_card.dat')) + self.configure_directory(html_opening =False) + + for Pdir in self.get_Pdir(): + misc.sprint(Pdir) + self.compile(['gensym'], cwd=Pdir) + self.compile(['madevent_forhel'], cwd=Pdir) + ############################################################################ def do_syscalc(self, line): """Evaluate systematics variation weights for a given run""" @@ -6132,7 +6184,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in 
Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file.' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6896,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6906,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7023,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... 
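The new `remove_empty_events` helper above triages channels that produced no events by scanning the tail of their logs for known markers ("Impossible BW configuration", "Loosen cuts or increase max_events", "all returned zero") and reporting counts per reason. A rough standalone sketch of that triage, assuming a plain `log.txt` per G directory and reading it forwards instead of using `misc.BackRead`, so it only approximates the real helper:

    import os

    def classify_empty_channel(gdir, max_lines=200):
        """Return a rough reason why a G directory produced no events.

        Mirrors the spirit of remove_empty_events above: a tiny or missing
        events.lhe triggers a scan of the last lines of the channel log for
        the known failure markers.
        """
        try:
            if os.path.getsize(os.path.join(gdir, 'events.lhe')) >= 10:
                return 'ok'
        except OSError:
            pass
        try:
            with open(os.path.join(gdir, 'log.txt'), errors='ignore') as log:
                tail = log.readlines()[-max_lines:]
        except OSError:
            return 'not found'
        for line in reversed(tail):
            if 'Impossible BW configuration' in line:
                return 'bwconfig'
            if 'Loosen cuts or increase max_events' in line:
                return 'cuts'
            if 'all returned zero' in line:
                return 'zero'
        return 'unknown'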
self.exec_cmd('combine_events') @@ -6902,6 +7051,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7066,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7078,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7203,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7223,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/restore_data b/epochX/cudacpp/pp_tt012j.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/restore_data +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . 
-mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/sum_html.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s

' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/madevent b/epochX/cudacpp/pp_tt012j.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/madevent +++ b/epochX/cudacpp/pp_tt012j.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h b/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h index 53dd560ed6..da11e740d9 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc index 47a3a011b8..a5e188e4f8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h index 76066c7bb1..24e0e80f84 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h index 7c6a082392..be5c5a6357 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/pp_tt012j.mad/test/cudacpp_test.mk b/epochX/cudacpp/pp_tt012j.mad/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/pp_tt012j.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/pp_tt012j.mad/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt index 85f434b58f..ea45eb7817 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,17 +46,16 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t INFO: load particles INFO: load vertices @@ -73,7 +72,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.12868547439575195  +DEBUG: model prefixing takes 0.1081535816192627  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -88,21 +87,21 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 3.699 s +1 processes with 72 diagrams generated in 4.226 s Total: 1 processes with 72 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_smeft_gg_tttt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: 
Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_smeft_gg_tttt INFO: remove old information in CODEGEN_mad_smeft_gg_tttt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ t t~ @1 @@ -114,25 +113,25 @@ FileWriter t t~ t t~ WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxttx -DEBUG: len(subproc_diagrams_for_config) =  70 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 68, 68: 69, 69: 71, 70: 72} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 68: 67, 69: 68, 71: 69, 72: 70} [model_handling.py at line 1552]  -Generated helas calls for 1 subprocesses (72 diagrams) in 0.189 s -Wrote files for 119 helas calls in 0.388 s +DEBUG: len(subproc_diagrams_for_config) =  70 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 
47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 68, 68: 69, 69: 71, 70: 72} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 68: 67, 69: 68, 71: 69, 72: 70} [model_handling.py at line 1577]  +Generated helas calls for 1 subprocesses (72 diagrams) in 0.172 s +Wrote files for 119 helas calls in 0.454 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.319 s +ALOHA: aloha creates 5 routines in 0.635 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 10 routines in 0.341 s +ALOHA: aloha creates 10 routines in 0.570 s VVV5 VVV5 FFV1 @@ -142,38 +141,32 @@ ALOHA: aloha creates 10 routines in 0.341 s VVVV1 VVVV9 VVVV10 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h -INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h +INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc INFO: Created files Parameters_SMEFTsim_topU3l_MwScheme_UFO.h and Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. 
The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/SubProcesses/P1_gg_ttxttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #2 succeeded at 275 (offset 48 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt done. +DEBUG: result.returncode =  0 [output.py at line 273]  +Output to directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/README +/home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/README Run "open index.html" to see more information about this process. quit -real 0m7.169s -user 0m6.853s -sys 0m0.298s -Code generation completed in 7 seconds +real 0m9.801s +user 0m8.912s +sys 0m0.731s +Code generation completed in 9 seconds ************************************************************ * * * W E L C O M E to * @@ -186,7 +179,7 @@ Code generation completed in 7 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -194,10 +187,9 @@ Code generation completed in 7 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards run @@ -216,7 +208,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -224,10 +216,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/COPYRIGHT b/epochX/cudacpp/smeft_gg_tttt.mad/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/COPYRIGHT +++ b/epochX/cudacpp/smeft_gg_tttt.mad/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt index 68b4c46295..311ceaa803 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/proc_card_mg5.dat index 9bcf8cac8c..33b9ca5c22 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.5 2025-10-17 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. 
* #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/run_card.dat b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/run_card.dat index 6b82577032..000832aacd 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/run_card.dat +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/run_card.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/run_card_default.dat b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/run_card_default.dat index b8db871c35..85e1d39035 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/run_card_default.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/MGMEVersion.txt b/epochX/cudacpp/smeft_gg_tttt.mad/MGMEVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/MGMEVersion.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Source/.make_opts b/epochX/cudacpp/smeft_gg_tttt.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Source/.make_opts +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Source/alfas_functions.f b/epochX/cudacpp/smeft_gg_tttt.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Source/cuts.inc b/epochX/cudacpp/smeft_gg_tttt.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Source/cuts.inc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Source/make_opts b/epochX/cudacpp/smeft_gg_tttt.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Source/make_opts +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Source/makefile b/epochX/cudacpp/smeft_gg_tttt.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Source/makefile +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Source/run_card.inc b/epochX/cudacpp/smeft_gg_tttt.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Source/run_card.inc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. 
Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. 
In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). + */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? 
- */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? + */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
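The grid-selection loop shown a few lines above can be read as a standalone helper. The following sketch is illustrative only (it assumes the default 256 threads per block and s_gputhreadsmin = 32 from the patch) and reproduces the same behaviour:

#include <stdexcept>
#include <utility>

// Halve the number of threads per block until nevt == gpublocks * gputhreads,
// never going below gputhreadsmin; throw if no such decomposition exists.
std::pair<int, int> chooseGpuGrid( int nevt, int gputhreadsmin = 32 )
{
  int gputhreads = 256;              // default number of gpu threads
  int gpublocks = nevt / gputhreads; // initial guess via integer division
  while( nevt != gpublocks * gputhreads )
  {
    gputhreads /= 2;
    if( gputhreads < gputhreadsmin )
      throw std::logic_error( "cannot choose gputhreads" );
    gpublocks = nevt / gputhreads;
  }
  return { gpublocks, gputhreads };
}
// For example, nevt=8192 yields (32, 256) and nevt=96 yields (3, 32); nevt=48 throws,
// consistent with the constructor requiring nevt to be a multiple of s_gputhreadsmin.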
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
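// Worked example of the cpos/fpos index mapping above (illustrative values only: this
// process has npar=6 external particles and np4=4 momentum components; neppM is assumed
// to be 4 here, but it depends on the SIMD/fptype build configuration):
//   ievt=5  =>  ipagM = ievt/neppM = 1,  ieppM = ievt%neppM = 1
//   with ipar=2, ip4=3:
//     fpos (AOS, Fortran-style)   = ievt*npar*np4 + ipar*np4 + ip4 = 5*24 + 8 + 3 = 131
//     cpos (AOSOA, cudacpp-style) = ipagM*npar*np4*neppM + ipar*np4*neppM + ip4*neppM + ieppM
//                                 = 1*96 + 2*16 + 3*4 + 1 = 141
// so out[141] = in[131] in the F2C direction (and the reverse assignment in C2F).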
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MGVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
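// Usage sketch (illustrative, not part of the patch) of the checkGpuBlas wrapper added in
// GpuRuntime.h above: any gpuBlas* call returning a gpuBlasStatus_t can be wrapped so that
// a non-success status is printed with its file and line and aborts via assert, exactly as
// checkGpu does for gpuError_t. This is the pattern used further down in this file:
//   gpuBlasHandle_t handle;
//   checkGpuBlas( gpuBlasCreate( &handle ) );       // expands to cublasCreate or hipblasCreate
//   checkGpuBlas( gpuBlasSetStream( handle, stream ) );
//   ...
//   checkGpuBlas( gpuBlasDestroy( handle ) );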
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
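// The MatrixElementKernelDevice constructor above enables BLAS color sums at run time by checking the
// CUDACPP_RUNTIME_BLASCOLORSUM environment variable inside a run-once "static bool first" block.
// Below is a minimal standalone C++ sketch of that pattern only (it is not part of the patch itself);
// the helper name parseBlasColorSumEnv is illustrative and does not exist in the plugin.
#include <cstdlib>
#include <iostream>
#include <string>

static bool parseBlasColorSumEnv()
{
  static bool first = true;
  static bool useBlas = false;
  if( first ) // evaluate the environment variable only once per process
  {
    first = false;
    const char* env = std::getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" );
    useBlas = ( env && std::string( env ) != "" ); // any non-empty value enables BLAS color sums
    std::cout << ( useBlas ? "INFO: enable BLAS color sums" : "INFO: disable BLAS color sums" ) << std::endl;
  }
  return useBlas;
}

int main()
{
  return parseBlasColorSumEnv() ? 0 : 1; // usage example: exit code reflects the runtime choice
}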
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryBuffers.h index 2f711d8cc1..24800c08c9 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_SMEFTsim_topU3l_MwScheme_UFO_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return 
NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA 
device buffer for color selection + typedef DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc index 96d77e5403..a1d3c787cf 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_SMEFTsim_topU3l_MwScheme_UFO_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_SMEFTsim_topU3l_MwScheme_UFO_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 12; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities -#endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const 
unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) +#endif + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // 
non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
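// The DeviceAccessJamp2 accessor introduced above addresses the jamp2 buffer as buffer[icol * nevt + ievt],
// i.e. a structure-of-arrays layout where all events for a given color icol are contiguous.
// The following is a host-side C++ sketch of that indexing convention only (toy sizes, illustrative names);
// on the device, nevt and ievt are derived from gridDim/blockDim/blockIdx/threadIdx as shown above.
#include <cassert>
#include <vector>

int main()
{
  const int ncolor = 12; // leading colors for gg -> ttxttx (see ncolor in CPPProcess.h)
  const int nevt = 4;    // illustrative event count
  std::vector<double> jamp2( ncolor * nevt, 0. );
  // host-side analogue of DeviceAccessJamp2::kernelAccessIcol for a given (icol, ievt)
  auto access = [&]( int icol, int ievt ) -> double& { return jamp2[icol * nevt + ievt]; };
  access( 3, 2 ) += 1.5; // accumulate |jamp|^2 for color 3 of event 2
  assert( jamp2[3 * nevt + 2] == 1.5 );
  return 0;
}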
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -1394,164 +1450,43 @@ namespace mg5amcCpu jamp_sv[8] -= 1. / 2. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttxttx()?) 
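// The hunk below removes the inline color-matrix sum (cf[ncolor][ncolor] and denom[ncolor]), which this
// patch delegates to color_sum.h (color_sum_gpu / color_sum_cpu, optionally via cuBLAS/hipBLAS).
// For reference, this is a standalone C++ sketch of the quadratic form that block computed,
// |M|^2 += sum_{i,j} cf[i][j]/denom[i] * ( Re(jamp_i)Re(jamp_j) + Im(jamp_i)Im(jamp_j) ),
// using a toy 2x2 color matrix instead of the real 12x12 one (values are illustrative only).
#include <complex>
#include <cstdio>

int main()
{
  constexpr int ncolor = 2;
  const double denom[ncolor] = { 3, 3 };
  const double cf[ncolor][ncolor] = { { 48, 16 }, { 16, 48 } };
  const std::complex<double> jamp[ncolor] = { { 1., 2. }, { -0.5, 0.25 } };
  double me2 = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    double ztempR = 0, ztempI = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ ) // full row sum, as in the removed CUDA branch
    {
      ztempR += cf[icol][jcol] * jamp[jcol].real();
      ztempI += cf[icol][jcol] * jamp[jcol].imag();
    }
    me2 += ( ztempR * jamp[icol].real() + ztempI * jamp[icol].imag() ) / denom[icol];
  }
  printf( "|M|^2 (color-summed, toy values) = %f\n", me2 ); // prints 85.000000 for these toy inputs
  return 0;
}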
- - // The color denominators (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] - - // The color matrix (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 48, 16, 16, 6, 0, 16, -2, 0, -6, -2, -2, 6 }, - { 16, 48, 6, 16, 16, 0, 0, -2, -2, -6, 6, -2 }, - { 16, 6, 48, 16, -2, 0, 0, 16, -2, 6, -6, -2 }, - { 6, 16, 16, 48, 0, -2, 16, 0, 6, -2, -2, -6 }, - { 0, 16, -2, 0, 48, 16, 16, 6, 0, -2, 16, 0 }, - { 16, 0, 0, -2, 16, 48, 6, 16, -2, 0, 0, 16 }, - { -2, 0, 0, 16, 16, 6, 48, 16, 16, 0, 0, -2 }, - { 0, -2, 16, 0, 6, 16, 16, 48, 0, 16, -2, 0 }, - { -6, -2, -2, 6, 0, -2, 16, 0, 48, 16, 16, 6 }, - { -2, -6, 6, -2, -2, 0, 0, 16, 16, 48, 6, 16 }, - { -2, 6, -6, -2, 16, 0, 0, -2, 16, 6, 48, 16 }, - { 6, -2, -2, -6, 0, 16, -2, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
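// The TriangularNormalizedColorMatrix struct removed above implements the #475 optimization: since cf is
// real and symmetric (and denom is constant), the "/denom" and factor-2 off-diagonal weights can be folded
// into an upper-triangular matrix computed once, so the color sum loops only over jcol >= icol.
// Standalone C++ sketch of that idea with a toy 2x2 matrix (illustrative only, assumes constant denom);
// it yields the same |M|^2 as the full-matrix form sketched earlier.
#include <cstdio>

int main()
{
  constexpr int ncolor = 2;
  constexpr double denom[ncolor] = { 3, 3 };
  constexpr double cf[ncolor][ncolor] = { { 48, 16 }, { 16, 48 } };
  double cf2[ncolor][ncolor] = {};
  for( int i = 0; i < ncolor; i++ )
  {
    cf2[i][i] = cf[i][i] / denom[i]; // diagonal terms
    for( int j = i + 1; j < ncolor; j++ )
      cf2[i][j] = 2 * cf[i][j] / denom[i]; // off-diagonal terms with the symmetry factor 2 folded in
  }
  const double jampR[ncolor] = { 1., -0.5 };
  const double jampI[ncolor] = { 2., 0.25 };
  double me2 = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = cf2[i][i] * jampR[i];
    double ztempI = cf2[i][i] * jampI[i];
    for( int j = i + 1; j < ncolor; j++ ) // upper triangle only
    {
      ztempR += cf2[i][j] * jampR[j];
      ztempI += cf2[i][j] * jampI[j];
    }
    me2 += jampR[i] * ztempR + jampI[i] * ztempI;
  }
  printf( "|M|^2 (upper-triangle loop, toy values) = %f\n", me2 ); // prints 85.000000, matching the full form
  return 0;
}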
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -1639,7 +1574,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -1674,6 +1613,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -1716,6 +1659,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_SMEFTsim_topU3l_MwScheme_UFO::mdl_MT ); m_masses.push_back( Parameters_SMEFTsim_topU3l_MwScheme_UFO::mdl_MT ); m_masses.push_back( Parameters_SMEFTsim_topU3l_MwScheme_UFO::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -1836,8 +1783,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -1845,25 +1792,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to 
store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -2008,13 +2133,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 1024 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -2026,18 +2145,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -2062,93 +2186,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -2190,7 +2251,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -2213,7 +2274,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -2222,21 +2283,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -2250,8 +2313,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -2267,11 +2332,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -2373,14 +2439,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h index d207c3303f..c1de405ab1 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_SMEFTsim_topU3l_MwScheme_UFO.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 72; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 12; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor 
individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig.f index ef1e17705f..a3462226d4 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig1.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig1.f index 2086a21e98..c42dfd786e 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig1.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -442,51 +446,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/color_sum.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/color_sum.cc new file mode 100644 index 0000000000..767405ac3b --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/color_sum.cc @@ -0,0 +1,437 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
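For reference, the new color_sum.cc file whose content follows factors the color sum out of CPPProcess.cc. Below is a minimal standalone sketch (hypothetical names such as colorSum; this is not the generated code) of the quadratic form it evaluates, |M|^2 += sum_{i,j} conj(J_i) * CF(i,j)/denom(i) * J_j, using the same real-symmetric-matrix and upper-triangle tricks (#475) as the color_sum_cpu implementation further down:

    // Minimal sketch: because CF is real and CF(i,j)/denom(i) is symmetric, the complex
    // quadratic form splits into two real ones ("AMA + BMB"), and only the diagonal plus
    // the upper triangle (counted twice) need to be visited.
    #include <array>
    #include <complex>

    template<int ncolor>
    double colorSum( const std::array<std::complex<double>, ncolor>& jamp, // color amplitudes J_i for one event and one helicity
                     const double ( &cf )[ncolor][ncolor],                 // real symmetric color matrix CF
                     const double ( &denom )[ncolor] )                     // color denominators
    {
      double me2 = 0;
      for( int icol = 0; icol < ncolor; icol++ )
      {
        // Diagonal term, then doubled off-diagonal terms for jcol > icol
        double ztempR = cf[icol][icol] / denom[icol] * jamp[icol].real();
        double ztempI = cf[icol][icol] / denom[icol] * jamp[icol].imag();
        for( int jcol = icol + 1; jcol < ncolor; jcol++ )
        {
          ztempR += 2 * cf[icol][jcol] / denom[icol] * jamp[jcol].real();
          ztempI += 2 * cf[icol][jcol] / denom[icol] * jamp[jcol].imag();
        }
        me2 += ztempR * jamp[icol].real() + ztempI * jamp[icol].imag();
      }
      return me2;
    }

This is the scalar analogue of the SIMD color_sum_cpu path below, and also of the cuBLAS path, which evaluates the same two real quadratic forms as one GEMM (color matrix times jamps) followed by per-event batched dot products.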
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] + + // The color matrix (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 48, 16, 16, 6, 0, 16, -2, 0, -6, -2, -2, 6 }, + { 16, 48, 6, 16, 16, 0, 0, -2, -2, -6, 6, -2 }, + { 16, 6, 48, 16, -2, 0, 0, 16, -2, 6, -6, -2 }, + { 6, 16, 16, 48, 0, -2, 16, 0, 6, -2, -2, -6 }, + { 0, 16, -2, 0, 48, 16, 16, 6, 0, -2, 16, 0 }, + { 16, 0, 0, -2, 16, 48, 6, 16, -2, 0, 0, 16 }, + { -2, 0, 0, 16, 16, 6, 48, 16, 16, 0, 0, -2 }, + { 0, -2, 16, 0, 6, 16, 16, 48, 0, 16, -2, 0 }, + { -6, -2, -2, 6, 0, -2, 16, 0, 48, 16, 16, 6 }, + { -2, -6, 6, -2, -2, 0, 0, 16, 16, 48, 6, 16 }, + { -2, 6, -6, -2, 16, 0, 0, -2, 16, 6, 48, 16 }, + { 6, -2, -2, -6, 0, 16, -2, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = 
TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: 
number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/color_sum.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/configs.inc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/configs.inc index 3710cb6806..2038dc7a01 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/configs.inc +++ 
b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/configs.inc @@ -1020,3 +1020,5 @@ C Diagram 70 DATA (SPROP(I,-4,70),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/70/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/cudacpp_overlay.mk b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/driver.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/driver.f index f7f23196eb..5997e65826 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/driver.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/fbridge.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/makefile_original.mk b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/matrix1.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/matrix1.f index 45032ad41c..0f7d6543d2 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/matrix1.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -71,10 +71,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -275,17 +272,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -355,7 +341,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -398,7 +384,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(34) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -441,111 +428,44 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /1.600000000000000D+01 - $ ,5.333333333333333D+00,5.333333333333333D+00,2.000000000000000D - $ +00,0.000000000000000D+00,5.333333333333333D+00/ - DATA (CF(I, 1),I= 7, 12) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,-2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,2.000000000000000D+00/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 12) /48,32,32,12,0,32,-4,0,-12,-4,-4,12/ C 1 T(1,2,3,4) T(5,6) - DATA (CF(I, 2),I= 1, 6) /5.333333333333333D+00 - $ ,1.600000000000000D+01,2.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,0.000000000000000D+00/ - DATA (CF(I, 2),I= 7, 12) /0.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01, - $ -2.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01/ + DATA (CF(I),I= 13, 23) /48,12,32,32,0,0,-4,-4,-12,12,-4/ C 1 T(1,2,3,6) T(5,4) - DATA (CF(I, 3),I= 1, 6) /5.333333333333333D+00 - $ ,2.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00,-6.666666666666666D-01,0.000000000000000D+00/ - DATA (CF(I, 3),I= 7, 12) /0.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,2.000000000000000D+00,-2.000000000000000D+00, - $ -6.666666666666666D-01/ + DATA (CF(I),I= 24, 33) /48,32,-4,0,0,32,-4,12,-12,-4/ C 1 T(1,2,5,4) T(3,6) - DATA (CF(I, 4),I= 1, 6) /2.000000000000000D+00 - $ ,5.333333333333333D+00,5.333333333333333D+00,1.600000000000000D - $ +01,0.000000000000000D+00,-6.666666666666666D-01/ - DATA (CF(I, 4),I= 7, 12) /5.333333333333333D+00 - $ ,0.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01, - $ -2.000000000000000D+00/ + DATA (CF(I),I= 34, 42) /48,0,-4,32,0,12,-4,-4,-12/ C 1 T(1,2,5,6) T(3,4) - DATA (CF(I, 5),I= 1, 6) /0.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ 
,0.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00/ - DATA (CF(I, 5),I= 7, 12) /5.333333333333333D+00 - $ ,2.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00/ + DATA (CF(I),I= 43, 50) /48,32,32,12,0,-4,32,0/ C 1 T(1,3,4) T(2,5,6) - DATA (CF(I, 6),I= 1, 6) /5.333333333333333D+00 - $ ,0.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,1.600000000000000D - $ +01/ - DATA (CF(I, 6),I= 7, 12) /2.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00/ + DATA (CF(I),I= 51, 57) /48,12,32,-4,0,0,32/ C 1 T(1,3,6) T(2,5,4) - DATA (CF(I, 7),I= 1, 6) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,2.000000000000000D+00/ - DATA (CF(I, 7),I= 7, 12) /1.600000000000000D+01 - $ ,5.333333333333333D+00,5.333333333333333D+00,0.000000000000000D - $ +00,0.000000000000000D+00,-6.666666666666666D-01/ + DATA (CF(I),I= 58, 63) /48,32,32,0,0,-4/ C 1 T(1,5,4) T(2,3,6) - DATA (CF(I, 8),I= 1, 6) /0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00,2.000000000000000D+00,5.333333333333333D+00/ - DATA (CF(I, 8),I= 7, 12) /5.333333333333333D+00 - $ ,1.600000000000000D+01,0.000000000000000D+00,5.333333333333333D - $ +00,-6.666666666666666D-01,0.000000000000000D+00/ + DATA (CF(I),I= 64, 68) /48,0,32,-4,0/ C 1 T(1,5,6) T(2,3,4) - DATA (CF(I, 9),I= 1, 6) /-2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,2.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01/ - DATA (CF(I, 9),I= 7, 12) /5.333333333333333D+00 - $ ,0.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00,5.333333333333333D+00,2.000000000000000D+00/ + DATA (CF(I),I= 69, 72) /48,32,32,12/ C 1 T(2,1,3,4) T(5,6) - DATA (CF(I, 10),I= 1, 6) /-6.666666666666666D-01, - $ -2.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,0.000000000000000D+00/ - DATA (CF(I, 10),I= 7, 12) /0.000000000000000D+00 - $ ,5.333333333333333D+00,5.333333333333333D+00,1.600000000000000D - $ +01,2.000000000000000D+00,5.333333333333333D+00/ + DATA (CF(I),I= 73, 75) /48,12,32/ C 1 T(2,1,3,6) T(5,4) - DATA (CF(I, 11),I= 1, 6) /-6.666666666666666D-01 - $ ,2.000000000000000D+00,-2.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00/ - DATA (CF(I, 11),I= 7, 12) /0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,2.000000000000000D - $ +00,1.600000000000000D+01,5.333333333333333D+00/ + DATA (CF(I),I= 76, 77) /48,32/ C 1 T(2,1,5,4) T(3,6) - DATA (CF(I, 12),I= 1, 6) /2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01, - $ -2.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00/ - DATA (CF(I, 12),I= 7, 12) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,2.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,1.600000000000000D+01/ + DATA (CF(I),I= 78, 78) /48/ C 1 T(2,1,5,6) T(3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. 
- IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -910,10 +830,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -922,6 +844,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/addmothers.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/addmothers.f index 9a31ed201d..d6cded9a2d 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else @@ -220,7 +221,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, ncolmp=0 endif if(mo_color.gt.1.and. - $ mo_color.ne.3.and.mo_color.ne.8)then + $ mo_color.ne.3.and.mo_color.ne.8.and.mo_color.ne.6)then da_color(1)=get_color(jpart(1,ida(1))) da_color(2)=get_color(jpart(1,ida(2))) call write_error(da_color(1), da_color(2), mo_color) @@ -326,8 +327,8 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif endif !end of check on LC -c Just zero helicity info for intermediate states - jpart(7,i) = 0 +c Just No helicity info for intermediate states + jpart(7,i) = 9 enddo ! 
do i 100 continue if (is_LC) call check_pure_internal_flow(icolalt,jpart, maxcolor) @@ -586,13 +587,13 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, i3=i3+1 c color for t-channels needs to be reversed if(i3.eq.1) icol(2,ires)=icolmp(1,i) - if(i3.eq.2) icol(1,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 c color for t-channels needs to be reversed if(i3bar.eq.1) icol(1,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(2,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(2,i) endif enddo @@ -764,6 +765,14 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, endif endif c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) + elseif(mo_color.eq.6.and.i3.eq.0.and.i3bar.eq.2)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue + elseif(mo_color.eq.6.and.i3.eq.2.and.i3bar.eq.0)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue else c Don't know how to deal with this call write_error(i3,i3bar,mo_color) @@ -814,12 +823,12 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(icolmp(1,i).gt.0)then i3=i3+1 if(i3.eq.1) icol(1,ires)=icolmp(1,i) - if(i3.eq.2) icol(2,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 if(i3bar.eq.1) icol(2,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(1,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(2,i) endif enddo @@ -830,23 +839,33 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(n3.le.1.and.n3bar.eq.0) icol(2,ires)=0 if(i3.ne.n3.or.i3bar.ne.n3bar) then - if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.0.and.i3.eq.0)then + if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.i3)then c This is an epsilon index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(1,ires)=maxcolor + if(i3.eq.0) then + maxcolor=maxcolor+1 + icol(1,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(2,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(2,ires)=-maxcolor endif - elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.0.and.i3bar.eq.0)then + elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.i3bar)then c This is an epsilonbar index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(2,ires)=maxcolor + if(i3bar.eq.0)then + maxcolor=maxcolor+1 + icol(2,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(1,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(1,ires)=-maxcolor endif elseif(n3.gt.0.and.n3bar.eq.0.and.i3-i3bar.eq.n3.or. $ n3bar.gt.0.and.n3.eq.0.and.i3bar-i3.eq.n3bar.or. 
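Going back to the matrix1.f change shown earlier in this patch: the full REAL*8 CF(NCOLOR,NCOLOR) color matrix is replaced by a packed INTEGER array holding only the upper triangle row by row (diagonal entries as-is, off-diagonal entries pre-doubled) plus a single common DENOM applied once at the end. A minimal C++ illustration of the packed indexing and of the equivalent sum (hypothetical names, NCOLOR=12 for this process, NAMPSO split-order loops omitted; this is a sketch, not the generated code):

    #include <cassert>
    #include <complex>
    #include <vector>

    // Position of the (i,j) entry (0-based, j >= i) in the row-by-row packed upper triangle
    inline int packedIndex( int i, int j, int ncolor )
    {
      assert( j >= i );
      return i * ncolor - i * ( i - 1 ) / 2 + ( j - i );
    }

    // cfPacked holds CF(i,i) on the diagonal and 2*CF(i,j) off-diagonal, as in the new DATA statements
    double colorSumPacked( const std::vector<int>& cfPacked,
                           const std::vector<std::complex<double>>& jamp,
                           int ncolor,
                           int denom )
    {
      double result = 0;
      int cfIndex = 0; // same linear walk as CF_INDEX in matrix1.f
      for( int i = 0; i < ncolor; i++ )
      {
        std::complex<double> ztemp = 0;
        for( int j = i; j < ncolor; j++ )
        {
          assert( cfIndex == packedIndex( i, j, ncolor ) ); // the walk matches the closed-form index
          ztemp += double( cfPacked[cfIndex++] ) * jamp[j];
        }
        result += ( ztemp * std::conj( jamp[i] ) ).real(); // the REAL*8 assignment in Fortran keeps the real part
      }
      return result / denom;
    }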
@@ -961,6 +980,12 @@ subroutine fix_s_color_indices(n3,n3bar,i3,i3bar,ncolmp,icolmp, if(n3.eq.1) icol(1,ires)=max_n3 if(n3bar.eq.1) icol(2,ires)=min_n3bar endif + do i=ires,-1 + if (icol(1,i).eq.maxcol) icol(1,i)=mincol + if (icol(1,i).eq.-maxcol) icol(1,i)=-mincol + if (icol(2,i).eq.maxcol) icol(2,i)=mincol + if (icol(2,i).eq.-maxcol) icol(2,i)=-mincol + enddo c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) endif else diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cluster.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/color_sum.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading 
colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. 
Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
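Note (illustration, not part of the patch): the HASBLAS block above selects hasBlas only when the BLAS header matching the chosen GPU compiler is actually found, and otherwise falls back to hasNoBlas, which in turn adds -DMGONGPU_HAS_NO_BLAS to the compile flags. A minimal Python sketch of that decision follows; the installation prefixes used in the example are hypothetical and not taken from the build system.

import os

def default_hasblas(gpucc, cuda_home="", hip_home=""):
    # Illustrative restatement of the cudacpp.mk HASBLAS default (paths are assumptions)
    if not gpucc:  # CPU-only build
        return "hasNoBlas"
    if "nvcc" in gpucc:  # Nvidia GPU build: require the cuBLAS header
        header = os.path.join(cuda_home, "include", "cublas_v2.h")
    elif "hipcc" in gpucc:  # AMD GPU build: require the hipBLAS header
        header = os.path.join(hip_home, "include", "hipblas", "hipblas.h")
    else:
        return "hasNoBlas"
    return "hasBlas" if os.path.isfile(header) else "hasNoBlas"

if __name__ == "__main__":
    # Hypothetical CUDA installation prefix, for illustration only
    print(default_hasblas("nvcc", cuda_home="/usr/local/cuda"))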
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
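Note (illustration, not part of the patch): the overlay makefile above derives the common and backend library names from BACKEND and from processid_short, i.e. the last two underscore-separated tokens of the current P* directory name; the PROCESS and DSIG object lists continue right below. A minimal Python sketch of that naming convention, using a hypothetical subprocess directory name:

def cudacpp_libs(process_dir, backend):
    # Mirror of processid_short: keep the last two '_'-separated tokens of the P* directory name
    tokens = process_dir.rstrip("/").split("/")[-1].split("_")
    processid_short = "_".join(tokens[-2:])
    # cuda and hip keep their own suffix, every other backend maps to the cpp libraries
    suffix = backend if backend in ("cuda", "hip") else "cpp"
    return ("mg5amc_common_" + suffix, "mg5amc_" + processid_short + "_" + suffix)

if __name__ == "__main__":
    # Hypothetical subprocess directory, for illustration only
    print(cudacpp_libs("P1_gg_ttxttx", "cuda"))  # ('mg5amc_common_cuda', 'mg5amc_gg_ttxttx_cuda')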
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cuts.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. + return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). 
+// Created by: Z. Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. + +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/genps.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/genps.f index 1c32e93f5d..5449ab9e30 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/genps.f @@ -124,7 +124,8 @@ subroutine gen_mom(iconfig,mincfig,maxcfig,invar,wgt,x,p1) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer fake_id + common/to_sprop/sprop,tprid,fake_id logical firsttime double precision xprop(3,nexternal),tprop(3,nexternal) @@ -1373,6 +1374,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1389,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not. 
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1637,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1657,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1818,8 +1834,8 @@ double precision function get_channel_cut(p, config) common/to_forest/ iforest, tstrategy integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) - integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer tprid(-max_branch:-1,lmaxconfigs), fake_id + common/to_sprop/sprop,tprid,fake_id double precision stot,m1,m2 common/to_stot/stot,m1,m2 @@ -1915,7 +1931,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1946,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile deleted file mode 100644 index 49e6800fff..0000000000 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile +++ /dev/null @@ -1,327 +0,0 @@ -SHELL := /bin/bash - -include ../../Source/make_opts - -# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) -# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing -include ../../src/cudacpp_config.mk -ifeq ($(CUDACPP_BUILDDIR),) -$(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) -endif - -# Disable all Fortran warnings? 
-FFLAGS+= -w - -# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html -FFLAGS+= -cpp - -# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) -CXXFLAGS = -O3 -Wall -Wshadow -Wextra - -# Add -std=c++17 explicitly to avoid build errors on macOS -# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" -ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 -endif - -# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) -ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) - override CXX:=ccache $(CXX) -endif -###ifeq ($(USECCACHE)$(shell echo $(FC) | grep ccache),1) -### override FC:=ccache $(FC) -###endif - -# Load additional dependencies of the bias module, if present -ifeq (,$(wildcard ../bias_dependencies)) -BIASDEPENDENCIES = -else -include ../bias_dependencies -endif - -# Definitions - -LIBDIR = ../../lib/ -BINDIR = ../../bin/ -PROG = madevent - -ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") - include ../MadLoop_makefile_definitions -else - LINK_LOOP_LIBS = - LOOP_LIBS = - LOOP_INCLUDE = - LINK_MADLOOP_LIB = - MADLOOP_LIB = -endif - -LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias - -CUDACPP_MAKEFILE=cudacpp.mk -processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') -ifeq ($(BACKEND),cuda) -CUDACPP_COMMONLIB=mg5amc_common_cuda -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda -else ifeq ($(BACKEND),hip) -CUDACPP_COMMONLIB=mg5amc_common_hip -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip -else -CUDACPP_COMMONLIB=mg5amc_common_cpp -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp -endif - -LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) - -ifneq ("$(wildcard ../../Source/RUNNING)","") - LINKLIBS += -lrunning - LIBS += $(LIBDIR)librunning.$(libext) -endif - - -# Source files - -MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) -MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) -ifeq ($(strip $(MATRIX_HEL)),) - MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) -endif - - -PROCESS= myamp.o genps.o unwgt.o setcuts.o get_color.o \ - cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ - idenparts.o dummy_fct.o - -DSIG=driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) -DSIG_cudacpp=driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) - -SYMMETRY = symmetry.o idenparts.o - -# Binaries - -ifeq ($(UNAME),Darwin) -LDFLAGS += -lc++ # avoid 'Undefined symbols' for chrono::steady_clock on macOS (checked with otool -L libmg5amc_gg_ttx_cpp.so) -LDFLAGS += -mmacosx-version-min=11.3 # avoid "ld: warning: object file was built for newer macOS version than being linked" -else -LDFLAGS += -Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 (not supported on macOS) -endif - -# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) -.DEFAULT_GOAL := all - -ifeq ($(BACKEND),cuda) -all: $(PROG)_fortran 
$(CUDACPP_BUILDDIR)/$(PROG)_cuda -else ifeq ($(BACKEND),hip) -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip -else -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp -endif - -# Disable OpenMP by default: enable OpenMP only if USEOPENMP=1 (#758) -ifeq ($(USEOPENMP),1) -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -override OMPFLAGS = -fopenmp -LINKLIBS += -liomp5 # see #578 -LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -override OMPFLAGS = -fopenmp -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 -else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang -else -override OMPFLAGS = -fopenmp -endif -endif - -$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o - $(FC) -o $(PROG)_fortran $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) - -$(LIBS): .libs - -.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat - cd ../../Source; make - touch $@ - -$(CUDACPP_BUILDDIR)/.cudacpplibs: - $(MAKE) -f $(CUDACPP_MAKEFILE) - touch $@ - -# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH -# Use relative paths with respect to the executables ($ORIGIN on Linux) -# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary -ifeq ($(UNAME_S),Darwin) - override LIBFLAGSRPATH = -else ifeq ($(USEBUILDDIR),1) - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' -else - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' -endif - -.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link - -madevent_fortran_link: $(PROG)_fortran - rm -f $(PROG) - ln -s $(PROG)_fortran $(PROG) - -madevent_cuda_link: - $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) - -madevent_hip_link: - $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) - -madevent_cpp_link: - $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -override SUPPORTED_AVXS = cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto -madevent_%_link: - @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then echo "ERROR! 
Invalid target '$@' (supported madevent_cpp*_link targets are: $(foreach avx,$(SUPPORTED_AVXS),'madevent_cpp$(avx)_link'))"; exit 1; fi - $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_cuda now uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_hip also uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -counters.o: counters.cc timer.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -ompnumthreads.o: ompnumthreads.cc ompnumthreads.h - $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ - -$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) - $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) - -gensym: $(SYMMETRY) configs.inc $(LIBS) - $(FC) -o gensym $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) - -###ifeq (,$(wildcard fbridge.inc)) # Pointless: fbridge.inc always exists as this is the cudacpp-modified makefile! -###$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat -### cd ../../Source/MODEL; make -### -###$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat -### cd ../../Source; make -### -###$(LIBDIR)libpdf.$(libext): -### cd ../../Source/PDF; make -### -###$(LIBDIR)libgammaUPC.$(libext): -### cd ../../Source/PDF/gammaUPC; make -###endif - -# Add source so that the compiler finds the DiscreteSampler module. 
-$(MATRIX): %.o: %.f - $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%.o: %.f - $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%_cudacpp.o: %.f - $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ - -# Dependencies - -driver.f: genps.inc -symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc -genps.o: genps.inc nexternal.inc configs.inc -dummy_fct.0: run.inc genps.inc -cuts.o: genps.inc nexternal.inc pmass.inc -setcuts.o: genps.inc run_config.inc -invarients.o: genps.inc nexternal.inc -myamp.o: props.inc genps.inc nexternal.inc -reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ - run_config.inc -cluster.o: cluster.inc genps.inc nexternal.inc message.inc -addmothers.o: genps.inc nexternal.inc symswap.inc message.inc -unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ - run_config.inc -initcluster.o: message.inc - -# Extra dependencies on discretesampler.mod - -auto_dsig.o: .libs -driver.o: .libs -driver_cudacpp.o: .libs -$(MATRIX): .libs -genps.o: .libs - -# Cudacpp bldall targets - -ifeq ($(UNAME_P),ppc64le) -bldavxs: bldnone bldsse4 -else ifeq ($(UNAME_P),arm) -bldavxs: bldnone bldsse4 -else -bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z -endif - -ifneq ($(shell which hipcc 2>/dev/null),) -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldhip bldcuda bldavxs -else -bldall: bldhip bldavxs -endif -else -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldcuda bldavxs -else -bldall: bldavxs -endif -endif - -bldcuda: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cuda - -bldhip: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=hip - -bldnone: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppnone - -bldsse4: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 - -bldavx2: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 - -bld512y: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y - -bld512z: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z - -# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) - -clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn - $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(CUDACPP_BUILDDIR)/$(PROG)_hip - -cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src - $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall - rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs - rm -f .libs - -cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src - make -C ../../Source cleanall - rm -rf $(LIBDIR)libbias.$(libext) - rm -f ../../Source/*.mod ../../Source/*/*.mod - -distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation - $(MAKE) -f $(CUDACPP_MAKEFILE) distclean diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile new file mode 120000 index 0000000000..9fba275947 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile @@ -0,0 +1 @@ +makefile_wrapper.mk \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile_original.mk 
b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile_original.mk new file mode 100644 index 0000000000..348c283be7 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile_original.mk @@ -0,0 +1,101 @@ +include ../../Source/make_opts +FFLAGS+= -w + +# Load additional dependencies of the bias module, if present +ifeq (,$(wildcard ../bias_dependencies)) +BIASDEPENDENCIES = +else +include ../bias_dependencies +endif + +# Definitions + +LIBDIR = ../../lib/ +BINDIR = ../../bin/ +PROG = madevent + +ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") + include ../MadLoop_makefile_definitions +else + LINK_LOOP_LIBS = + LOOP_LIBS = + LOOP_INCLUDE = + LINK_MADLOOP_LIB = + MADLOOP_LIB = +endif + +LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L../../lib/ -ldhelas -ldsample -lmodel -lgeneric -lpdf -lgammaUPC -lcernlib $(llhapdf) -lbias + +LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) + +ifneq ("$(wildcard ../../Source/RUNNING)","") + LINKLIBS += -lrunning + LIBS += $(LIBDIR)librunning.$(libext) +endif + + +# Source files + +MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) +MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) +ifeq ($(strip $(MATRIX_HEL)),) + MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) +endif + + +PROCESS= driver.o myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o \ + $(patsubst %.f,%.o,$(wildcard auto_dsig*.f)) \ + +SYMMETRY = symmetry.o idenparts.o + +# Binaries + +$(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) + $(FC) -o $(PROG) $(PROCESS) $(MATRIX) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) + +$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat + cd ../../Source/MODEL; make + +$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat + cd ../../Source; make + +$(LIBDIR)libpdf.$(libext): + cd ../../Source/PDF; make + +$(LIBDIR)libgammaUPC.$(libext): + cd ../../Source/PDF/gammaUPC; make + +# Add source so that the compiler finds the DiscreteSampler module. 
+$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +# Dependencies + +driver.f: genps.inc +symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc +genps.o: genps.inc nexternal.inc configs.inc +dummy_fct.0: run.inc genps.inc +cuts.o: genps.inc nexternal.inc pmass.inc +setcuts.o: genps.inc run_config.inc +invarients.o: genps.inc nexternal.inc +myamp.o: props.inc genps.inc nexternal.inc +reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ + run_config.inc +cluster.o: cluster.inc genps.inc nexternal.inc message.inc +addmothers.o: genps.inc nexternal.inc symswap.inc message.inc +unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ + run_config.inc +initcluster.o: message.inc + +clean: + $(RM) *.o gensym madevent madevent_forhel diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/myamp.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min with dsqrt_shatmax**2 with the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/reweight.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/reweight.f index 0a0bafa7c1..9d8fe1c4f0 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/reweight.f @@ -976,9 +976,9 @@ logical function setclscales(p, keepq2bck, ivec) $ ' and jcentral is ',jcentral(1),jcentral(2) if (btest(mlevel,3)) then - write(*,'(a$)') 'QCD jets (final): ' + write(*,'(a,$)') 'QCD jets (final): ' do i=3,nexternal - if(iqjets(i).gt.0) write(*,'(i3$)') i + if(iqjets(i).gt.0) write(*,'(i3,$)') i enddo write(*,*) endif @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) @@ -1387,7 +1387,9 @@ double precision function rewgt(p, ivec) integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - include 'configs.inc' + integer fake_id + common/to_sprop/sprop,tprid,fake_id +c include 'configs.inc' real*8 xptj,xptb,xpta,xptl,xmtc real*8 xetamin,xqcut,deltaeta common /to_specxpt/xptj,xptb,xpta,xptl,xmtc,xetamin,xqcut,deltaeta @@ -1588,6 +1590,8 @@ double precision function rewgt(p, ivec) $ ipdgcl(1,igraphs(1),iproc),ipart,.false.).and. $ (goodjet(idacl(n,1)).or.goodjet(idacl(n,2)))) then c alpha_s weight + + if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1600,6 +1604,7 @@ double precision function rewgt(p, ivec) write(*,*)' as: ',alphas(alpsfact*dsqrt(q2now)), & '/',asref,' -> ',alphas(alpsfact*dsqrt(q2now))/asref write(*,*)' and G=',SQRT(4d0*PI*ALPHAS(scale)) + endif endif endif endif diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/runTest.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/symmetry.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/symmetry.f index 309540a0a2..d0706e90b4 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/symmetry.f @@ -51,6 +51,7 @@ program symmetry integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) + integer fake_id include 'configs.inc' data use_config/0,lmaxconfigs*0/ @@ -232,7 +233,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +243,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +261,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/unwgt.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/banner.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/banner.py index 42d82818d0..2bc6174b85 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/banner.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3306,7 +3353,7 @@ def edit_dummy_fct_from_file(self, filelist, outdir): def retro_compatible_custom_fct(lines, mode=None): f77_type = ['real*8', 'integer', 'double precision', 'logical'] - function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ + function_pat = re.compile(r'^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ % {'type':'|'.join(f77_type)}, re.I+re.M) include_pat = re.compile(r"\s+include\s+[\'\"]([\w\./]*)") @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - 
logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/check_param_card.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
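For reference, the flavour_bias consistency rules added to check_validity above reduce to the following minimal standalone sketch (illustrative only; InvalidRunCard is replaced by ValueError and the run-card values are passed as plain arguments):

import logging

logger = logging.getLogger(__name__)

def check_flavour_bias(flavour_bias, event_norm):
    # flavour_bias = [abs(PDG) of the flavour to enhance, enhancement factor]
    if len(flavour_bias) != 2:
        raise ValueError("'flavour_bias' should contain exactly two numbers: "
                         "the abs(PDG) of the flavour to enhance and the "
                         "enhancement multiplication factor.")
    if any(i < 0 for i in flavour_bias):
        raise ValueError("flavour and multiplication factor should be positive "
                         "in the flavour_bias parameter")
    if flavour_bias[1] != 1 and event_norm != 'bias':
        logger.warning('Non-trivial flavour enhancement factor: setting event '
                       'normalisation to "bias"')
        event_norm = 'bias'
    return event_norm

# Example: check_flavour_bias([5, 100], 'average') warns and returns 'bias'.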
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/common_run_interface.py index 9ff7390cf5..8de498fcc2 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. 
Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. (beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. 
Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" @@ -6868,8 +6890,9 @@ def handle_alarm(signum, frame): else: log_level=20 - - if run_card: + if run_card and (run_card['lpp1'] !=0 or run_card['lpp2'] !=0): + # They are likely case like lpp=+-3, where alpas not need reset + # but those have dedicated name of pdf avoid the reset as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, 'cteq6_l': 0.118, diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/file_writers.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/files.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/files.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, 
path2, log=True, error=False): if error: raise if log: - logger.warning(why) + logger.warning("fail to cp", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "

To save bandwidth not all diagrams were converted to PNG."; print PAGE "

To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) 
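The per-channel splitting introduced just below in get_job_for_event is a ceiling division of the requested events by the per-job maximum, clamped between 1 and max_splitting; a condensed sketch of that arithmetic (the default values shown here are illustrative, with max_request_event taken from the new --maxevts option when running with several processes):

def nb_split_for(needed_event, max_request_event=2500, max_splitting=130,
                 split_channels=True):
    # Ceiling division of the requested events by the per-job maximum,
    # clamped to [1, max_splitting]; a single job if splitting is disabled.
    if not split_channels:
        return 1
    nb_split = (int(needed_event) - 1) // int(max_request_event) + 1
    return max(1, min(nb_split, max_splitting))

assert nb_split_for(2500) == 1
assert nb_split_for(2501) == 2
assert nb_split_for(10**9) == 130   # capped by max_splitting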
+ if 'nprocs' in opts: + self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/histograms.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/histograms.py index 51ae2914fc..0883cd9613 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central 
merging_scale for the variation. + if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2405,10 +2404,10 @@ def output(self, path, format='gnuplot',number_of_ratios = -1, gnuplot_output_list=gnuplot_output_list_v5 else: output, _ = p.communicate() - output.decode(errors='ignore') + output = output.decode(errors='ignore') if not output: gnuplot_output_list=gnuplot_output_list_v5 - elif float(output.split()[1]) < 5. : + elif int(output.split()[1].split('.')[0]) < 5 : gnuplot_output_list=gnuplot_output_list_v4 else: gnuplot_output_list=gnuplot_output_list_v5 @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. 
Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. 
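The gnuplot test changed above in histograms.py now compares the integer major version instead of applying float() to the whole version token; a minimal sketch of that parsing, assuming "gnuplot --version" prints something like "gnuplot 5.4 patchlevel 2":

import subprocess

def gnuplot_major_version(default=5):
    # Return the major version of the gnuplot found in PATH, falling back
    # to the v5 behaviour when gnuplot is missing or the output is unparsable.
    try:
        out = subprocess.run(['gnuplot', '--version'],
                             capture_output=True, text=True).stdout
        return int(out.split()[1].split('.')[0])
    except (OSError, IndexError, ValueError):
        return default

use_v4_templates = gnuplot_major_version() < 5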
import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/lhe_parser.py index f6e47956cd..d4b94bab10 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -1792,7 +1794,10 @@ def add_decays(self, pdg_to_decay): if particle.pdg in pdg_to_decay and pdg_to_decay[particle.pdg]: one_decay = pdg_to_decay[particle.pdg].pop() self.add_decay_to_particle(i, one_decay) + particle.helicity = 9 
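The sign fix applied to FourMomentum.pseudorapidity a few hunks below follows the standard definition eta = 0.5*ln((|p|+pz)/(|p|-pz)), so a forward-going particle (pz > 0) gets a positive eta; a quick numerical check of that convention:

import math

def pseudorapidity(px, py, pz):
    # eta = 0.5 * ln((|p| + pz) / (|p| - pz))
    norm = math.sqrt(px**2 + py**2 + pz**2)
    return 0.5 * math.log((norm + pz) / (norm - pz))

assert pseudorapidity(1.0, 0.0, 10.0) > 0   # forward particle, positive eta
# The previous expression, log((norm - pz) / (norm + pz)), flips the sign.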
return self.add_decays(pdg_to_decay) + + return self @@ -2166,10 +2171,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2961,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..dea35930ea 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz 
%(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3667,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_comine_iteration(self, line): + def do_combine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
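When the old interface is not requested, do_pythia8 above looks for Pythia's main164 example under pythia8_path and falls back to the MG5aMC_PY8_interface route if it cannot be found; a condensed sketch of that lookup (the helper name is hypothetical, the probed paths mirror the ones above):

import os

def find_pythia8_main164(pythia8_path):
    # Return (path to main164, need_old_interface).
    for candidate in (os.path.join(pythia8_path, 'share', 'Pythia8', 'examples', 'main164'),
                      os.path.join(pythia8_path, 'examples', 'main164')):
        if os.path.exists(candidate):
            return candidate, False   # run main164 directly, passing the card with '-c'
    return None, True                 # caller retries via the --old_interface path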
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -5596,6 +5635,19 @@ def do_plot(self, line): else: logger.info('No valid files for delphes plot') + def do_compile(self, line): + """compile the current directory """ + + args = self.split_arg(line) + self.ask_run_configuration(mode='parton') + self.run_card = banner_mod.RunCard(pjoin(self.me_dir, 'Cards', 'run_card.dat')) + self.configure_directory(html_opening =False) + + for Pdir in self.get_Pdir(): + misc.sprint(Pdir) + self.compile(['gensym'], cwd=Pdir) + self.compile(['madevent_forhel'], cwd=Pdir) + ############################################################################ def do_syscalc(self, line): """Evaluate systematics variation weights for a given run""" @@ -6132,7 +6184,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in 
Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file.' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6896,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6906,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7023,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... 
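The remove_empty_events helper added above drops G directories whose events.lhe is essentially empty (fewer than 10 bytes) and classifies the reason by scanning the channel log; a much-reduced sketch of that classification (keywords follow the patch, the backwards log reading is simplified to a plain read):

import collections, os

def classify_empty_channels(gdirs):
    reasons = collections.defaultdict(list)
    kept = []
    for gdir in gdirs:
        lhe = os.path.join(gdir, 'events.lhe')
        if os.path.exists(lhe) and os.path.getsize(lhe) >= 10:
            kept.append(gdir)
            continue
        try:
            log = open(os.path.join(gdir, 'log.txt')).read()
        except OSError:
            log = ''
        if 'Impossible BW configuration' in log:
            reasons['bwconfig'].append(gdir)
        elif 'Loosen cuts or increase max_events' in log:
            reasons['cuts'].append(gdir)
        elif 'all returned zero' in log:
            reasons['zero'].append(gdir)
        else:
            reasons['unknown'].append(gdir)
    return kept, reasons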
self.exec_cmd('combine_events') @@ -6902,6 +7051,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7066,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7078,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7203,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7223,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/restore_data b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/restore_data +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . 
-mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/sum_html.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s

' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/madevent b/epochX/cudacpp/smeft_gg_tttt.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/madevent +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/src/HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h b/epochX/cudacpp/smeft_gg_tttt.mad/src/HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h index 98fc59d3ea..32bd465108 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/src/HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/src/HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc b/epochX/cudacpp/smeft_gg_tttt.mad/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc index e394058ac8..bbcb428317 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. 
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.h b/epochX/cudacpp/smeft_gg_tttt.mad/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.h index 6d053c0d16..93a221c714 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuConfig.h index 7c6a082392..be5c5a6357 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/test/cudacpp_test.mk b/epochX/cudacpp/smeft_gg_tttt.mad/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/smeft_gg_tttt.mad/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt index 5444229389..f3ee9f80b4 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,17 +46,16 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t INFO: load particles INFO: load vertices @@ -73,7 +72,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.12831377983093262  +DEBUG: model prefixing takes 0.08983516693115234  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -88,33 +87,33 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 3.671 s +1 processes with 72 diagrams generated in 3.162 s Total: 1 processes with 72 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the 
exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/. 
-Generated helas calls for 1 subprocesses (72 diagrams) in 0.186 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/. +Generated helas calls for 1 subprocesses (72 diagrams) in 0.227 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.316 s +ALOHA: aloha creates 5 routines in 0.248 s VVV5 VVV5 FFV1 @@ -124,17 +123,17 @@ ALOHA: aloha creates 5 routines in 0.316 s VVVV1 VVVV9 VVVV10 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h -INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h +INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc INFO: Created files Parameters_SMEFTsim_topU3l_MwScheme_UFO.h and Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. quit -real 0m5.073s -user 0m4.975s -sys 0m0.073s -Code generation completed in 5 seconds +real 0m4.505s +user 0m4.403s +sys 0m0.078s +Code generation completed in 4 seconds diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/COPYRIGHT b/epochX/cudacpp/smeft_gg_tttt.sa/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/COPYRIGHT +++ b/epochX/cudacpp/smeft_gg_tttt.sa/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). 
+ */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
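The gpu*/gpuBlas* macro layer in GpuAbstraction.h and the checkGpu/checkGpuBlas wrappers in GpuRuntime.h are the intended entry points for all device and BLAS calls, so that the same source compiles for both CUDA and HIP. A minimal usage sketch, assuming a CUDA or HIP build of the plugin; scaleKernel, scaleOnDevice and the buffer sizes are hypothetical names chosen only for illustration:

// Hypothetical sketch: doubling a device buffer through the gpu* abstraction layer.
#include "GpuAbstraction.h"
#include "GpuRuntime.h"

__global__ void scaleKernel( double* data )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  data[ievt] *= 2;
}

void scaleOnDevice( double* hstData, const int gpublocks, const int gputhreads )
{
  const int nevt = gpublocks * gputhreads;
  double* devData = nullptr;
  gpuMalloc( (void**)&devData, nevt * sizeof( double ) );                        // checkGpu-wrapped cudaMalloc/hipMalloc
  gpuMemcpy( devData, hstData, nevt * sizeof( double ), gpuMemcpyHostToDevice );
  gpuStream_t stream;
  gpuStreamCreate( &stream );                                                    // checkGpu-wrapped stream creation
  gpuLaunchKernelStream( scaleKernel, gpublocks, gputhreads, stream, devData );  // launch the kernel on the given stream
#ifndef MGONGPU_HAS_NO_BLAS
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) );           // asserts via assertGpuBlas if status != GPUBLAS_STATUS_SUCCESS
  checkGpuBlas( gpuBlasSetStream( handle, stream ) ); // attach the cuBLAS/hipBLAS handle to the same stream
  gpuBlasDestroy( handle );
#endif
  checkGpu( gpuDeviceSynchronize() );
  gpuMemcpy( hstData, devData, nevt * sizeof( double ), gpuMemcpyDeviceToHost );
  gpuStreamDestroy( stream );
  gpuFree( devData );
}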
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
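The MatrixElementKernelDevice changes above introduce one GPU stream per good helicity plus a single cuBLAS/hipBLAS handle, with BLAS color sums opted into at run time via the CUDACPP_RUNTIME_BLASCOLORSUM environment variable. A condensed sketch of that lifecycle, assuming a BLAS-enabled CUDA build (MGONGPU_HAS_NO_BLAS undefined); setupHelicityStreams and teardownHelicityStreams are hypothetical free functions standing in for the ctor/dtor and computeGoodHelicities logic shown in the diff:

// Hypothetical condensed sketch of the per-helicity stream and BLAS handle lifecycle.
#include "GpuAbstraction.h"
#include "GpuRuntime.h"
#include <cstdlib>
#include <string>

void setupHelicityStreams( const int nGoodHel, gpuStream_t* helStreams, gpuBlasHandle_t& blasHandle, bool& blasColorSum )
{
  // Runtime opt-in to BLAS color sums (any non-empty value enables them)
  const char* blasEnv = std::getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" );
  blasColorSum = ( blasEnv != nullptr && std::string( blasEnv ) != "" );
  // One stream per good helicity, so that calculate_jamps kernels for different helicities can overlap
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
    gpuStreamCreate( &helStreams[ighel] );
  if( blasColorSum )
  {
    checkGpuBlas( gpuBlasCreate( &blasHandle ) ); // a single handle shared by all good helicities
#if defined __CUDACC__ and defined MGONGPU_FPTYPE2_FLOAT
    // Optional TF32 tensor-core math (toggled by CUDACPP_RUNTIME_CUBLASTF32TENSOR in the real code)
    if( std::getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ) != nullptr )
      checkGpuBlas( cublasSetMathMode( blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) );
#endif
  }
}

void teardownHelicityStreams( const int nGoodHel, gpuStream_t* helStreams, gpuBlasHandle_t blasHandle, bool blasColorSum )
{
  if( blasColorSum ) gpuBlasDestroy( blasHandle );
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
    if( helStreams[ighel] ) gpuStreamDestroy( helStreams[ighel] ); // guard against streams that were never created
}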
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryBuffers.h index 2f711d8cc1..24800c08c9 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_SMEFTsim_topU3l_MwScheme_UFO_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return 
NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA 
device buffer for color selection + typedef DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc index 6a64c39915..0355ad5663 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_SMEFTsim_topU3l_MwScheme_UFO_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_SMEFTsim_topU3l_MwScheme_UFO_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 12; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities -#endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const 
unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) +#endif + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // 
non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
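The DeviceAccessJamp2 helper introduced above addresses the jamp2 buffer in color-major [ncolor][nevt] order, so that consecutive threads of a warp touch consecutive events of the same color (coalesced access). A minimal standalone sketch of the same indexing pattern; sumOverColors and its arguments are hypothetical, and double precision is assumed:

// Hypothetical kernel illustrating the [ncolor][nevt] layout used by DeviceAccessJamp2:
// element (icol,ievt) lives at buffer[icol * nevt + ievt], so threads with consecutive ievt
// read contiguous memory for each color.
__global__ void sumOverColors( const double* jamp2s, // input: jamp2s[ncolor][nevt]
                               double* total,        // output: total[nevt], sum over colors
                               const int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  double sum = 0;
  for( int icol = 0; icol < ncolor; icol++ )
    sum += jamp2s[icol * nevt + ievt]; // same indexing as DeviceAccessJamp2::kernelAccessIcolConst
  total[ievt] = sum;
}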
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -1342,164 +1398,43 @@ namespace mg5amcCpu jamp_sv[8] -= 1. / 2. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttxttx()?) 
- - // The color denominators (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] - - // The color matrix (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 48, 16, 16, 6, 0, 16, -2, 0, -6, -2, -2, 6 }, - { 16, 48, 6, 16, 16, 0, 0, -2, -2, -6, 6, -2 }, - { 16, 6, 48, 16, -2, 0, 0, 16, -2, 6, -6, -2 }, - { 6, 16, 16, 48, 0, -2, 16, 0, 6, -2, -2, -6 }, - { 0, 16, -2, 0, 48, 16, 16, 6, 0, -2, 16, 0 }, - { 16, 0, 0, -2, 16, 48, 6, 16, -2, 0, 0, 16 }, - { -2, 0, 0, 16, 16, 6, 48, 16, 16, 0, 0, -2 }, - { 0, -2, 16, 0, 6, 16, 16, 48, 0, 16, -2, 0 }, - { -6, -2, -2, 6, 0, -2, 16, 0, 48, 16, 16, 6 }, - { -2, -6, 6, -2, -2, 0, 0, 16, 16, 48, 6, 16 }, - { -2, 6, -6, -2, 16, 0, 0, -2, 16, 6, 48, 16 }, - { 6, -2, -2, -6, 0, 16, -2, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
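The block removed here is the per-thread color sum that used to live inside calculate_wavefunctions: the constexpr denom/cf arrays and the quadratic form they feed are now evaluated in a dedicated color-sum step (the color_sum.h include added above and the color_sum_gpu call later in sigmaKin), optionally through cuBLAS/hipBLAS. For reference, the quantity computed per event and per helicity from the partial amplitudes J_i (jamp_sv) is

\[ |M|^2 \;=\; \sum_{i=1}^{n_{\rm color}} \frac{1}{{\rm denom}_i}\,{\rm Re}\Big( J_i^{*} \sum_{j=1}^{n_{\rm color}} cf_{ij}\, J_j \Big) \;=\; \sum_{i,j=1}^{n_{\rm color}} \frac{cf_{ij}}{{\rm denom}_i}\,\big( {\rm Re}\,J_i\,{\rm Re}\,J_j + {\rm Im}\,J_i\,{\rm Im}\,J_j \big), \]

where the second form uses the fact that cf is real (issue #475); its symmetry is what the triangular, pre-normalized constexpr variant in the removed C++ branch exploited. With the jamps of one helicity stored as an ncolor-by-nevt matrix, the sum over j is a matrix-matrix product and the contraction over i a per-event reduction, which is the shape of work the gpuBlasTgemm/gpuBlasTdot wrappers added in GpuAbstraction.h are meant to cover (the exact kernel choice is made inside the color_sum implementation, which is not part of this hunk).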
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -1587,7 +1522,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -1622,6 +1561,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -1664,6 +1607,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_SMEFTsim_topU3l_MwScheme_UFO::mdl_MT ); m_masses.push_back( Parameters_SMEFTsim_topU3l_MwScheme_UFO::mdl_MT ); m_masses.push_back( Parameters_SMEFTsim_topU3l_MwScheme_UFO::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -1784,8 +1731,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -1793,25 +1740,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to 
store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1956,13 +2081,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 1024 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1974,18 +2093,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -2010,93 +2134,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -2138,7 +2199,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -2161,7 +2222,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -2170,21 +2231,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -2198,8 +2261,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -2215,11 +2280,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -2321,14 +2387,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h index d207c3303f..c1de405ab1 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. 
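Both selection kernels in the hunks above (add_and_select_hel for helicities, #403, and select_col for colors, #402) implement the same sampling step: compare one uniform random number against a running sum of non-negative weights, normalised by the total, and return the first index that passes as a Fortran-style 1-based value. A minimal standalone sketch of that step, with a hypothetical helper name and std::vector weights standing in for the device buffers, is:

#include <vector>

// Cumulative-sum sampling as used for event-by-event helicity (#403) and color (#402)
// selection; 'weights' are e.g. the per-good-helicity MEs or the per-color jamp2 sums,
// 'rnd' is uniform in [0,1). Returns a 1-based index, or 0 if the total weight vanishes
// (the color kernel returns 0 in the same way when channelId == 0, see #783/#931).
inline int pickIndexFromRunningSum( const std::vector<double>& weights, const double rnd ) // hypothetical helper
{
  double total = 0;
  for( double w : weights ) total += w;
  if( total <= 0 ) return 0;
  double runningSum = 0;
  const int n = (int)weights.size();
  for( int i = 0; i < n; i++ )
  {
    runningSum += weights[i];
    if( rnd < runningSum / total ) return i + 1; // NB Fortran [1,N], cudacpp [0,N-1]
  }
  return n; // guard against rounding when rnd is very close to 1
}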
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_SMEFTsim_topU3l_MwScheme_UFO.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 72; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 12; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] 
- int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/color_sum.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/color_sum.cc new file mode 100644 index 0000000000..767405ac3b --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/color_sum.cc @@ -0,0 +1,437 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
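The color_sum.cc file introduced above evaluates, for one helicity, the color-summed contribution deltaME = sum_ij Re( conj(jamp_i) * CF_ij/denom_i * jamp_j ); since the color matrix is real, this splits into separate real and imaginary quadratic forms, and on the CPU path the symmetric matrix is folded into a triangle with doubled off-diagonal entries and the 1/denom normalisation baked in at compile time (#475). A toy 2-color sketch of that reduction (illustrative values only, not the generated 12x12 matrix) is:

#include <complex>

// Toy illustration of the triangular color sum: for a real symmetric CF,
// (A - iB) CF (A + iB) = A CF A + B CF B with A = Re(jamp), B = Im(jamp),
// and only the upper triangle is needed once off-diagonal terms are doubled.
inline double colorSumToy( const std::complex<double> jamp[2] )
{
  constexpr double denom[2] = { 3, 3 };                   // toy denominators
  constexpr double cf[2][2] = { { 48, 16 }, { 16, 48 } }; // toy symmetric color matrix
  double deltaME = 0;
  for( int icol = 0; icol < 2; icol++ )
  {
    double ztempR = cf[icol][icol] / denom[icol] * jamp[icol].real();
    double ztempI = cf[icol][icol] / denom[icol] * jamp[icol].imag();
    for( int jcol = icol + 1; jcol < 2; jcol++ )
    {
      ztempR += 2 * cf[icol][jcol] / denom[icol] * jamp[jcol].real(); // doubled off-diagonal term
      ztempI += 2 * cf[icol][jcol] / denom[icol] * jamp[jcol].imag();
    }
    deltaME += jamp[icol].real() * ztempR + jamp[icol].imag() * ztempI;
  }
  return deltaME;
}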
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] + + // The color matrix (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 48, 16, 16, 6, 0, 16, -2, 0, -6, -2, -2, 6 }, + { 16, 48, 6, 16, 16, 0, 0, -2, -2, -6, 6, -2 }, + { 16, 6, 48, 16, -2, 0, 0, 16, -2, 6, -6, -2 }, + { 6, 16, 16, 48, 0, -2, 16, 0, 6, -2, -2, -6 }, + { 0, 16, -2, 0, 48, 16, 16, 6, 0, -2, 16, 0 }, + { 16, 0, 0, -2, 16, 48, 6, 16, -2, 0, 0, 16 }, + { -2, 0, 0, 16, 16, 6, 48, 16, 16, 0, 0, -2 }, + { 0, -2, 16, 0, 6, 16, 16, 48, 0, 16, -2, 0 }, + { -6, -2, -2, 6, 0, -2, 16, 0, 48, 16, 16, 6 }, + { -2, -6, 6, -2, -2, 0, 0, 16, 16, 48, 6, 16 }, + { -2, 6, -6, -2, 16, 0, 0, -2, 16, 6, 48, 16 }, + { 6, -2, -2, -6, 0, 16, -2, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = 
TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: 
number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/color_sum.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/cudacpp_overlay.mk 
b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/fbridge.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/makefile_original.mk b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/color_sum.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
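The color_sum_blas routine above maps the same quadratic form onto two BLAS passes per real/imaginary plane: one gemm computing Ztemp = NormalizedColorMatrix x Jamps over all nhel*nevt columns at once, followed by a strided-batched gemm of 1x1 results that accumulates the per-column dot product Jamps_e . Ztemp_e into the per-helicity MEs (beta = 1). A plain C++ reference of what those two steps compute for one plane, with hypothetical buffer names and a column-per-(ihel,ievt) layout, is:

// CPU reference (not the patch itself) for one real or imaginary plane:
//   step 1: ztemp[icol][e] = sum_j normColMat[icol][j] * jamps[j][e]   (the gemm)
//   step 2: mes[e]        += sum_i jamps[i][e] * ztemp[i][e]           (the batched 1x1 gemm, beta=1)
// where e = ihel * nevt + ievt runs over all nColEvents = nGoodHel * nevt columns.
inline void colorSumBlasReference( double* mes,              // [nColEvents], accumulated in place
                                   const double* jamps,      // [ncolor * nColEvents], jamps[icol * nColEvents + e]
                                   const double* normColMat, // [ncolor * ncolor], CF[i][j] / denom[i]
                                   double* ztemp,            // [ncolor * nColEvents] scratch
                                   const int ncolor,
                                   const int nColEvents )
{
  for( int e = 0; e < nColEvents; e++ )
  {
    for( int i = 0; i < ncolor; i++ )
    {
      double sum = 0;
      for( int j = 0; j < ncolor; j++ ) sum += normColMat[i * ncolor + j] * jamps[j * nColEvents + e];
      ztemp[i * nColEvents + e] = sum;
    }
    double dot = 0;
    for( int i = 0; i < ncolor; i++ ) dot += jamps[i * nColEvents + e] * ztemp[i * nColEvents + e];
    mes[e] += dot;
  }
}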
+ +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef 
MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
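The DeviceAccessJamp accessor declared in color_sum.h above keeps the jamps for all good helicities in two contiguous [ncolor][nhel][nevt] planes (real first, then imaginary) with ievt fastest, so that both the CUDA/HIP kernels and cuBLAS can stride through the same super-buffer. A tiny host-side helper (hypothetical, not in the patch) spelling out that flat index is:

// Flat offset into the ghelAllJamps super-buffer for element (reim, icol, ihel, ievt),
// matching the expression used in kernelAccessIcolIhelNhel(Const): reim = 0 selects
// the real plane, reim = 1 the imaginary plane.
inline int jampFlatIndex( const int reim, const int icol, const int ihel, const int ievt,
                          const int ncolor, const int nhel, const int nevt )
{
  return reim * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt;
}
// Example: the real part of jamp(icol,ihel) for event ievt lives at
// buffer[ jampFlatIndex( 0, icol, ihel, ievt, ncolor, nhel, nevt ) ].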
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
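The HASBLAS block above only decides two things: whether -DMGONGPU_HAS_NO_BLAS is added to CXXFLAGS/GPUFLAGS (BLASCXXFLAGS) and whether -lcublas or -lhipblas is added at link time (BLASLIBFLAGS); the sources then compile out the BLAS path accordingly, and color_sum_gpu additionally falls back to the kernel path at run time when the BLAS handle is null. A compile-time sketch of that guard, with a hypothetical helper, is:

#include <cstdio>

// Hypothetical helper (not in the patch) showing the effect of HASBLAS=hasNoBlas:
// with -DMGONGPU_HAS_NO_BLAS the BLAS color sum is compiled out entirely, so only
// the per-helicity kernel path remains available.
inline bool blasColorSumAvailable()
{
#ifdef MGONGPU_HAS_NO_BLAS
  return false; // hasNoBlas build: no cuBLAS/hipBLAS code or link dependency
#else
  return true; // hasBlas build: cuBLAS/hipBLAS is linked via BLASLIBFLAGS
#endif
}

int main()
{
  std::printf( "BLAS color sum available: %s\n", blasColorSumAvailable() ? "yes" : "no" );
  return 0;
}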
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
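# Illustration (not part of the patch): for a hypothetical subprocess directory
# containing driver.f, auto_dsig.f and auto_dsig1.f, the DSIG pattern rules below
# would expand roughly to
#   DSIG         = driver.o auto_dsig1.o
#   DSIG_cudacpp = driver_cudacpp.o auto_dsig1_cudacpp.o
# i.e. auto_dsig.f itself is filtered out and each remaining auto_dsig*.f is compiled
# twice, once for plain Fortran and once with -DMG5AMC_MEEXPORTER_CUDACPP (see the
# %_cudacpp.o rule further down).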
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. 
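// Usage sketch (illustrative only, not part of this header): a C++ test harness
// could drive the Fortran-facing API declared below roughly as follows, assuming
// caller-allocated arrays and hypothetical values nevt=32, npar=4, np4=4:
//   CppObjectInFortran* bridge = nullptr;
//   int nevt = 32, npar = 4, np4 = 4;
//   fbridgecreate_( &bridge, &nevt, &npar, &np4 );
//   bool goodHelOnly = false;
//   fbridgesequence_( &bridge, momenta, gs, rndhel, rndcol, channelIds,
//                     mes, selhel, selcol, &goodHelOnly );
//   fbridgedelete_( &bridge );
// In production these entry points are called from Fortran (hence the trailing
// underscores and the pointer-to-scalar arguments); all FORTRANFPTYPE arrays are
// double precision as typedef'd below.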
+ +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/runTest.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h b/epochX/cudacpp/smeft_gg_tttt.sa/src/HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h index 98fc59d3ea..32bd465108 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/src/HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc b/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc index e394058ac8..bbcb428317 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.h b/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.h index 6d053c0d16..93a221c714 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuConfig.h index d3c4ca5695..7d34de72f8 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/test/cudacpp_test.mk b/epochX/cudacpp/smeft_gg_tttt.sa/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/smeft_gg_tttt.sa/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
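# Note (illustrative): on non-Darwin hosts the googletest build configured below adds
# -mavx2 -mfma through GTEST_CMAKE_FLAGS, so the configure step effectively becomes
#   cmake -DCMAKE_INSTALL_PREFIX:PATH=<THISDIR>/googletest/install \
#         -DCMAKE_CXX_FLAGS="-mavx2 -mfma" -DBUILD_GMOCK=OFF ../
# while on Darwin no extra flags are passed; googletest is now cloned at tag v1.17.0
# instead of release-1.11.0.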
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt index 1690ef1273..856e106f98 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -550,21 +549,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.124 s +1 processes with 6 diagrams generated in 0.091 s Total: 1 processes with 6 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_t1t1 --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_susy_gg_t1t1 INFO: remove old information in CODEGEN_mad_susy_gg_t1t1 -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 @@ -576,57 +575,51 @@ FileWriter t1 t1~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_t1t1x -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5} [model_handling.py at line 1552]  -Generated helas calls for 1 subprocesses (6 diagrams) in 0.009 s -Wrote files for 16 helas calls in 0.082 s +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5} [model_handling.py at line 1577]  +Generated helas calls for 1 subprocesses (6 diagrams) in 0.008 s +Wrote files for 16 helas calls in 0.096 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.186 s +ALOHA: aloha creates 3 routines in 0.146 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha 
creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 6 routines in 0.184 s +ALOHA: aloha creates 6 routines in 0.144 s VVV1 VSS1 VSS1 VSS1 VVSS1 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/SubProcesses/P1_gg_t1t1x; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #2 succeeded at 215 (offset -12 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 done. +DEBUG: result.returncode =  0 [output.py at line 273]  +Output to directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 done. 
Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/README +/home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/README Run "open index.html" to see more information about this process. quit -real 0m2.996s -user 0m2.690s -sys 0m0.299s +real 0m3.181s +user 0m2.732s +sys 0m0.440s Code generation completed in 3 seconds ************************************************************ * * @@ -640,7 +633,7 @@ Code generation completed in 3 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -648,10 +641,9 @@ Code generation completed in 3 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -670,7 +662,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -678,10 +670,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/COPYRIGHT b/epochX/cudacpp/susy_gg_t1t1.mad/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/COPYRIGHT +++ b/epochX/cudacpp/susy_gg_t1t1.mad/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt index 68b4c46295..311ceaa803 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/proc_card_mg5.dat index 9025117612..a5aa626839 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.5 2025-10-17 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/run_card.dat b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/run_card.dat index 6b82577032..000832aacd 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/run_card.dat +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/run_card.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/run_card_default.dat b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/run_card_default.dat index b8db871c35..85e1d39035 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/run_card_default.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/MGMEVersion.txt b/epochX/cudacpp/susy_gg_t1t1.mad/MGMEVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/MGMEVersion.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/.make_opts b/epochX/cudacpp/susy_gg_t1t1.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/.make_opts +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/alfas_functions.f b/epochX/cudacpp/susy_gg_t1t1.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/cuts.inc b/epochX/cudacpp/susy_gg_t1t1.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/cuts.inc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts b/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/makefile b/epochX/cudacpp/susy_gg_t1t1.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/makefile +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/run_card.inc b/epochX/cudacpp/susy_gg_t1t1.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/run_card.inc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. 
Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. 
In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). + */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? 
- */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? + */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
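  // Descriptive note on the param_card lookup that follows: the loop first tries
  // "../Cards/param_card.dat" relative to the current working directory and then
  // "../../Cards/param_card.dat"; if neither exists, the final candidate
  // "../../../Cards/param_card.dat" is passed to initProc without a further check.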
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
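
To make the index arithmetic of the transposition kernels above easier to follow, here is a minimal host-side C++ sketch of the same Fortran AOS to C++ AOSOA mapping; the helper name transposeF2C and the std::vector interface are illustrative and not part of the patch.

#include <cassert>
#include <vector>

// Rearrange Fortran-style AOS momenta[ievt][ipar][ip4] into C++-style AOSOA
// momenta[ipagM][ipar][ip4][ieppM] pages of neppM events each (F2C direction).
std::vector<double> transposeF2C( const std::vector<double>& in, int nevt, int npar, int np4, int neppM )
{
  assert( nevt % neppM == 0 );                       // nevt must be a multiple of the SIMD page size
  assert( in.size() == (size_t)nevt * npar * np4 );  // one 4-momentum component per (event, particle, index)
  std::vector<double> out( in.size() );
  for( int ievt = 0; ievt < nevt; ievt++ )
    for( int ipar = 0; ipar < npar; ipar++ )
      for( int ip4 = 0; ip4 < np4; ip4++ )
      {
        const int ipagM = ievt / neppM;
        const int ieppM = ievt % neppM;
        const int fpos = ievt * npar * np4 + ipar * np4 + ip4;                                  // AOS (Fortran)
        const int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; // AOSOA (C++)
        out[cpos] = in[fpos];
      }
  return out;
}
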
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MGVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
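
As an illustrative usage sketch (not part of the patch), the portability macros from GpuAbstraction.h and the checkGpuBlas helper added above can be combined as below. This assumes a GPU build with BLAS enabled (MGONGPUCPP_GPUIMPL defined and MGONGPU_HAS_NO_BLAS undefined); the function axpyOnDevice is hypothetical.

#include "GpuAbstraction.h"
#include "GpuRuntime.h"

// Compute y = alpha*x + y on the device via the gpuBlas* portability macros
// (cublasDaxpy on CUDA, hipblasDaxpy on HIP), checking every status code.
void axpyOnDevice( int n, double alpha, const double* hx, double* hy )
{
  double *dx = nullptr, *dy = nullptr;
  gpuMalloc( &dx, n * sizeof( double ) );
  gpuMalloc( &dy, n * sizeof( double ) );
  gpuMemcpy( dx, hx, n * sizeof( double ), gpuMemcpyHostToDevice );
  gpuMemcpy( dy, hy, n * sizeof( double ), gpuMemcpyHostToDevice );
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) );
  checkGpuBlas( gpuBlasDaxpy( handle, n, &alpha, dx, 1, dy, 1 ) );
  gpuMemcpy( hy, dy, n * sizeof( double ), gpuMemcpyDeviceToHost );
  checkGpuBlas( gpuBlasDestroy( handle ) );
  gpuFree( dx );
  gpuFree( dy );
}
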
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
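
For context on the CUDACPP_RUNTIME_BLASCOLORSUM check in the MatrixElementKernelDevice constructor above, this is a minimal sketch of the same environment-variable toggle pattern; the helper envFlagIsSet is illustrative.

#include <cstdlib>
#include <string>

// The feature is enabled only if the variable is set and non-empty
// (no "Y"/"N" decoding yet, as noted in the patch comments).
inline bool envFlagIsSet( const char* name )
{
  const char* value = std::getenv( name );
  return value && std::string( value ) != "";
}
// e.g. const bool useBlasColorSum = envFlagIsSet( "CUDACPP_RUNTIME_BLASCOLORSUM" );
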
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryBuffers.h index 5bd3053393..c5e79dc1b1 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; 
#endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color 
selection + typedef DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc index 1b3601c86b..aa42f4a070 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_MSSM_SLHA2.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_MSSM_SLHA2_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE 
void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = 
DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
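
For readers following the new DeviceAccessJamp2 accessor above, here is a minimal host-side sketch of the color-major buffer layout it assumes, i.e. buffer[icol * nevt + ievt], which keeps the per-event accesses of one color contiguous (and hence coalesced across GPU threads); the helper accessIcol is illustrative.

#include <cassert>
#include <vector>

// Same indexing as DeviceAccessJamp2::kernelAccessIcol, but on a host buffer.
inline double& accessIcol( std::vector<double>& buffer, int icol, int ievt, int nevt )
{
  assert( ievt < nevt );
  return buffer[icol * nevt + ievt];
}
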
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -403,154 +459,43 @@ namespace mg5amcCpu jamp_sv[1] += amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_t1t1x()?) 
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2 }, - { -2, 16 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
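What is being removed here is the color sum itself: each helicity adds sum_ij [ Re(J_i) M_ij Re(J_j) + Im(J_i) M_ij Im(J_j) ] to |M|^2, with M_ij = cf[i][j] / denom[i], and on the C++ side the off-diagonal factor 2 and the 1/denom normalization are folded into a constexpr triangular matrix at compile time (issue #475). The standalone sketch below uses the 2x2 color matrix and denominators quoted in this hunk and checks that the triangular form reproduces the plain symmetric sum; the function names are illustrative, not the plugin's API.

#include <cassert>
#include <cmath>
#include <complex>

constexpr int ncolor = 2;
constexpr double denom[ncolor] = { 3, 3 };
constexpr double cf[ncolor][ncolor] = { { 16, -2 }, { -2, 16 } };

// Full symmetric quadratic form (the branch kept for CUDA in the removed code)
double colorSumFull( const std::complex<double> jamp[ncolor] )
{
  double me = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = 0, ztempI = 0;
    for( int j = 0; j < ncolor; j++ )
    {
      ztempR += cf[i][j] * jamp[j].real();
      ztempI += cf[i][j] * jamp[j].imag();
    }
    me += ( ztempR * jamp[i].real() + ztempI * jamp[i].imag() ) / denom[i];
  }
  return me;
}

// Triangular form with the factor 2 and 1/denom folded in up front (the #475 trick);
// it relies on cf[i][j]/denom[i] being symmetric, as noted in the removed comments.
double colorSumTriangular( const std::complex<double> jamp[ncolor] )
{
  double me = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = cf[i][i] / denom[i] * jamp[i].real();
    double ztempI = cf[i][i] / denom[i] * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztempR += 2 * cf[i][j] / denom[i] * jamp[j].real();
      ztempI += 2 * cf[i][j] / denom[i] * jamp[j].imag();
    }
    me += ztempR * jamp[i].real() + ztempI * jamp[i].imag();
  }
  return me;
}

int main()
{
  const std::complex<double> jamp[ncolor] = { { 0.3, -1.1 }, { 2.0, 0.7 } };
  assert( std::abs( colorSumFull( jamp ) - colorSumTriangular( jamp ) ) < 1e-12 );
  return 0;
}

The same normalization is what createNormalizedColorMatrix precomputes into device memory in the new color_sum.cc below, where color_sum_kernel applies the explicit factor 2 in its lower-triangular loop instead.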
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -578,7 +523,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -611,6 +560,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_Msu3 ); m_masses.push_back( m_pars->mdl_Msu3 ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_Msu3, (fptype)m_pars->mdl_Wsu3, (fptype)m_pars->mdl_Msu6, (fptype)m_pars->mdl_Wsu6 }; @@ -651,6 +604,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_MSSM_SLHA2::ZERO ); m_masses.push_back( Parameters_MSSM_SLHA2::mdl_Msu3 ); m_masses.push_back( Parameters_MSSM_SLHA2::mdl_Msu3 ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -771,8 +728,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -780,25 +737,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store 
the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -943,13 +1078,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -961,18 +1090,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -997,93 +1131,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1125,7 +1196,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1148,7 +1219,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1157,21 +1228,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1185,8 +1258,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1202,11 +1277,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1308,14 +1384,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h index d48c729c48..f01e3c5efd 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_MSSM_SLHA2.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 4; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 6; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, 
running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig.f index 28f44ab169..6b4d390131 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig1.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig1.f index 40fbb596f2..05a7d543d8 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig1.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -442,51 +446,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 
0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/color_sum.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/color_sum.cc new file mode 100644 index 0000000000..b68b9250fd --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/color_sum.cc @@ -0,0 +1,427 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2 }, + { -2, 16 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one
specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities 
for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! 
From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need 
one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/color_sum.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/configs.inc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/configs.inc index cbcfeb2c9a..5e64cc3afe 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/configs.inc +++ 
b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/configs.inc @@ -42,3 +42,5 @@ C Diagram 5 DATA (SPROP(I,-2,5),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/5/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/cudacpp_overlay.mk b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/driver.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/driver.f index 3fc552a31d..5f9d807b6d 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/driver.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/fbridge.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/makefile_original.mk b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/matrix1.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/matrix1.f index 1a1830b77a..bbf79fd11b 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/matrix1.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -71,10 +71,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -215,17 +212,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -295,7 +281,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -338,7 +324,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -383,23 +370,31 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 2) /5.333333333333333D+00, - $ -6.666666666666666D-01/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 2) /16,-4/ C 1 T(1,2,3,4) - DATA (CF(I, 2),I= 1, 2) /-6.666666666666666D-01 - $ ,5.333333333333333D+00/ + DATA (CF(I),I= 3, 3) /16/ C 1 T(2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WSU3.NE.0D0) FK_MDL_WSU3 = SIGN(MAX(ABS(MDL_WSU3), - $ ABS(MDL_MSU3*SMALL_WIDTH_TREATMENT)), MDL_WSU3) - IF(MDL_WSU6.NE.0D0) FK_MDL_WSU6 = SIGN(MAX(ABS(MDL_WSU6), - $ ABS(MDL_MSU6*SMALL_WIDTH_TREATMENT)), MDL_WSU6) + FK_ZERO = 0D0 + IF(MDL_WSU3.NE.0D0) THEN + FK_MDL_WSU3 = SIGN(MAX(ABS(MDL_WSU3), ABS(MDL_MSU3 + $ *SMALL_WIDTH_TREATMENT)), MDL_WSU3) + ELSE + FK_MDL_WSU3 = 0D0 + ENDIF + + IF(MDL_WSU6.NE.0D0) THEN + FK_MDL_WSU6 = SIGN(MAX(ABS(MDL_WSU6), ABS(MDL_MSU6 + $ *SMALL_WIDTH_TREATMENT)), MDL_WSU6) + ELSE + FK_MDL_WSU6 = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
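Note (editorial, not part of the patch): the matrix1.f hunk above and the one below replace the dense REAL*8 color matrix CF(NCOLOR,NCOLOR) by a packed integer upper triangle CF(NCOLOR*(NCOLOR+1)/2) plus a single DENOM, consumed by the new CF_INDEX loop over J >= I. The following is a minimal standalone sketch, in C++ rather than the generated Fortran, of why the two storages give the same result. It assumes, consistent with the DATA values shown (-4 where the dense matrix had -6.666...D-01 = -2/3, and 16 where it had 16/3), that packed entries are pre-multiplied by DENOM and that off-diagonal entries are additionally doubled to absorb their symmetric partners; the jamp values are arbitrary illustrative inputs.

#include <array>
#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  using cxd = std::complex<double>;
  constexpr int ncolor = 2; // as in this P1_gg_t1t1x subprocess
  // Dense color matrix from the removed DATA statements: {16/3, -2/3; -2/3, 16/3}
  const double cfDense[ncolor][ncolor] = { { 16. / 3., -2. / 3. }, { -2. / 3., 16. / 3. } };
  // Packed storage from the new DATA statements: upper triangle, scaled by DENOM,
  // with off-diagonal entries additionally doubled (assumption inferred from the values)
  const std::array<int, ncolor*( ncolor + 1 ) / 2> cfPacked = { 16, -4, 16 };
  const int denom = 3;
  // Arbitrary color amplitudes ("jamps") for one event and helicity
  const std::array<cxd, ncolor> jamp = { cxd( 0.3, -1.1 ), cxd( -0.7, 0.4 ) };
  // Dense sum: Re( sum_ij conj(J_i) * CF(i,j) * J_j )
  double meDense = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      meDense += std::real( std::conj( jamp[i] ) * cfDense[i][j] * jamp[j] );
  // Packed sum: the CF_INDEX loop runs only over J >= I, then divides once by DENOM
  double mePacked = 0;
  int cfIndex = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    cxd ztemp = 0;
    for( int j = i; j < ncolor; j++ ) ztemp += double( cfPacked[cfIndex++] ) * jamp[j];
    mePacked += std::real( ztemp * std::conj( jamp[i] ) ); // real part, matching the REAL*8 accumulator
  }
  mePacked /= denom;
  assert( std::fabs( meDense - mePacked ) < 1e-12 );
  return 0;
}

Under these assumptions the J >= I loop visits each off-diagonal pair only once, and the single division by DENOM replaces the per-element rational coefficients of the old REAL*8 matrix.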
@@ -451,10 +446,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -463,6 +460,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(2)=AMP2(2)+AMP(3)*DCONJG(AMP(3)) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/addmothers.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/addmothers.f index 9a31ed201d..d6cded9a2d 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else @@ -220,7 +221,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, ncolmp=0 endif if(mo_color.gt.1.and. - $ mo_color.ne.3.and.mo_color.ne.8)then + $ mo_color.ne.3.and.mo_color.ne.8.and.mo_color.ne.6)then da_color(1)=get_color(jpart(1,ida(1))) da_color(2)=get_color(jpart(1,ida(2))) call write_error(da_color(1), da_color(2), mo_color) @@ -326,8 +327,8 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif endif !end of check on LC -c Just zero helicity info for intermediate states - jpart(7,i) = 0 +c Just No helicity info for intermediate states + jpart(7,i) = 9 enddo ! 
do i 100 continue if (is_LC) call check_pure_internal_flow(icolalt,jpart, maxcolor) @@ -586,13 +587,13 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, i3=i3+1 c color for t-channels needs to be reversed if(i3.eq.1) icol(2,ires)=icolmp(1,i) - if(i3.eq.2) icol(1,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 c color for t-channels needs to be reversed if(i3bar.eq.1) icol(1,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(2,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(2,i) endif enddo @@ -764,6 +765,14 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, endif endif c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) + elseif(mo_color.eq.6.and.i3.eq.0.and.i3bar.eq.2)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue + elseif(mo_color.eq.6.and.i3.eq.2.and.i3bar.eq.0)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue else c Don't know how to deal with this call write_error(i3,i3bar,mo_color) @@ -814,12 +823,12 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(icolmp(1,i).gt.0)then i3=i3+1 if(i3.eq.1) icol(1,ires)=icolmp(1,i) - if(i3.eq.2) icol(2,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 if(i3bar.eq.1) icol(2,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(1,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(2,i) endif enddo @@ -830,23 +839,33 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(n3.le.1.and.n3bar.eq.0) icol(2,ires)=0 if(i3.ne.n3.or.i3bar.ne.n3bar) then - if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.0.and.i3.eq.0)then + if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.i3)then c This is an epsilon index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(1,ires)=maxcolor + if(i3.eq.0) then + maxcolor=maxcolor+1 + icol(1,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(2,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(2,ires)=-maxcolor endif - elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.0.and.i3bar.eq.0)then + elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.i3bar)then c This is an epsilonbar index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(2,ires)=maxcolor + if(i3bar.eq.0)then + maxcolor=maxcolor+1 + icol(2,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(1,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(1,ires)=-maxcolor endif elseif(n3.gt.0.and.n3bar.eq.0.and.i3-i3bar.eq.n3.or. $ n3bar.gt.0.and.n3.eq.0.and.i3bar-i3.eq.n3bar.or. 
@@ -961,6 +980,12 @@ subroutine fix_s_color_indices(n3,n3bar,i3,i3bar,ncolmp,icolmp, if(n3.eq.1) icol(1,ires)=max_n3 if(n3bar.eq.1) icol(2,ires)=min_n3bar endif + do i=ires,-1 + if (icol(1,i).eq.maxcol) icol(1,i)=mincol + if (icol(1,i).eq.-maxcol) icol(1,i)=-mincol + if (icol(2,i).eq.maxcol) icol(2,i)=mincol + if (icol(2,i).eq.-maxcol) icol(2,i)=-mincol + enddo c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) endif else diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cluster.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/color_sum.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors 
+ const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. 
Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cuts.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. + return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. 
Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. + +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/genps.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/genps.f index 1c32e93f5d..5449ab9e30 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/genps.f @@ -124,7 +124,8 @@ subroutine gen_mom(iconfig,mincfig,maxcfig,invar,wgt,x,p1) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer fake_id + common/to_sprop/sprop,tprid,fake_id logical firsttime double precision xprop(3,nexternal),tprop(3,nexternal) @@ -1373,6 +1374,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1389,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not. 
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1637,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1657,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1818,8 +1834,8 @@ double precision function get_channel_cut(p, config) common/to_forest/ iforest, tstrategy integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) - integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer tprid(-max_branch:-1,lmaxconfigs), fake_id + common/to_sprop/sprop,tprid,fake_id double precision stot,m1,m2 common/to_stot/stot,m1,m2 @@ -1915,7 +1931,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1946,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile deleted file mode 100644 index 49e6800fff..0000000000 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile +++ /dev/null @@ -1,327 +0,0 @@ -SHELL := /bin/bash - -include ../../Source/make_opts - -# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) -# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing -include ../../src/cudacpp_config.mk -ifeq ($(CUDACPP_BUILDDIR),) -$(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) -endif - -# Disable all Fortran warnings? 
-FFLAGS+= -w - -# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html -FFLAGS+= -cpp - -# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) -CXXFLAGS = -O3 -Wall -Wshadow -Wextra - -# Add -std=c++17 explicitly to avoid build errors on macOS -# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" -ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 -endif - -# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) -ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) - override CXX:=ccache $(CXX) -endif -###ifeq ($(USECCACHE)$(shell echo $(FC) | grep ccache),1) -### override FC:=ccache $(FC) -###endif - -# Load additional dependencies of the bias module, if present -ifeq (,$(wildcard ../bias_dependencies)) -BIASDEPENDENCIES = -else -include ../bias_dependencies -endif - -# Definitions - -LIBDIR = ../../lib/ -BINDIR = ../../bin/ -PROG = madevent - -ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") - include ../MadLoop_makefile_definitions -else - LINK_LOOP_LIBS = - LOOP_LIBS = - LOOP_INCLUDE = - LINK_MADLOOP_LIB = - MADLOOP_LIB = -endif - -LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias - -CUDACPP_MAKEFILE=cudacpp.mk -processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') -ifeq ($(BACKEND),cuda) -CUDACPP_COMMONLIB=mg5amc_common_cuda -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda -else ifeq ($(BACKEND),hip) -CUDACPP_COMMONLIB=mg5amc_common_hip -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip -else -CUDACPP_COMMONLIB=mg5amc_common_cpp -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp -endif - -LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) - -ifneq ("$(wildcard ../../Source/RUNNING)","") - LINKLIBS += -lrunning - LIBS += $(LIBDIR)librunning.$(libext) -endif - - -# Source files - -MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) -MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) -ifeq ($(strip $(MATRIX_HEL)),) - MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) -endif - - -PROCESS= myamp.o genps.o unwgt.o setcuts.o get_color.o \ - cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ - idenparts.o dummy_fct.o - -DSIG=driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) -DSIG_cudacpp=driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) - -SYMMETRY = symmetry.o idenparts.o - -# Binaries - -ifeq ($(UNAME),Darwin) -LDFLAGS += -lc++ # avoid 'Undefined symbols' for chrono::steady_clock on macOS (checked with otool -L libmg5amc_gg_ttx_cpp.so) -LDFLAGS += -mmacosx-version-min=11.3 # avoid "ld: warning: object file was built for newer macOS version than being linked" -else -LDFLAGS += -Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 (not supported on macOS) -endif - -# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) -.DEFAULT_GOAL := all - -ifeq ($(BACKEND),cuda) -all: $(PROG)_fortran 
$(CUDACPP_BUILDDIR)/$(PROG)_cuda -else ifeq ($(BACKEND),hip) -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip -else -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp -endif - -# Disable OpenMP by default: enable OpenMP only if USEOPENMP=1 (#758) -ifeq ($(USEOPENMP),1) -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -override OMPFLAGS = -fopenmp -LINKLIBS += -liomp5 # see #578 -LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -override OMPFLAGS = -fopenmp -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 -else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang -else -override OMPFLAGS = -fopenmp -endif -endif - -$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o - $(FC) -o $(PROG)_fortran $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) - -$(LIBS): .libs - -.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat - cd ../../Source; make - touch $@ - -$(CUDACPP_BUILDDIR)/.cudacpplibs: - $(MAKE) -f $(CUDACPP_MAKEFILE) - touch $@ - -# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH -# Use relative paths with respect to the executables ($ORIGIN on Linux) -# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary -ifeq ($(UNAME_S),Darwin) - override LIBFLAGSRPATH = -else ifeq ($(USEBUILDDIR),1) - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' -else - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' -endif - -.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link - -madevent_fortran_link: $(PROG)_fortran - rm -f $(PROG) - ln -s $(PROG)_fortran $(PROG) - -madevent_cuda_link: - $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) - -madevent_hip_link: - $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) - -madevent_cpp_link: - $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -override SUPPORTED_AVXS = cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto -madevent_%_link: - @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then echo "ERROR! 
Invalid target '$@' (supported madevent_cpp*_link targets are: $(foreach avx,$(SUPPORTED_AVXS),'madevent_cpp$(avx)_link'))"; exit 1; fi - $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_cuda now uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_hip also uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -counters.o: counters.cc timer.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -ompnumthreads.o: ompnumthreads.cc ompnumthreads.h - $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ - -$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) - $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) - -gensym: $(SYMMETRY) configs.inc $(LIBS) - $(FC) -o gensym $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) - -###ifeq (,$(wildcard fbridge.inc)) # Pointless: fbridge.inc always exists as this is the cudacpp-modified makefile! -###$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat -### cd ../../Source/MODEL; make -### -###$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat -### cd ../../Source; make -### -###$(LIBDIR)libpdf.$(libext): -### cd ../../Source/PDF; make -### -###$(LIBDIR)libgammaUPC.$(libext): -### cd ../../Source/PDF/gammaUPC; make -###endif - -# Add source so that the compiler finds the DiscreteSampler module. 
-$(MATRIX): %.o: %.f - $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%.o: %.f - $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%_cudacpp.o: %.f - $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ - -# Dependencies - -driver.f: genps.inc -symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc -genps.o: genps.inc nexternal.inc configs.inc -dummy_fct.0: run.inc genps.inc -cuts.o: genps.inc nexternal.inc pmass.inc -setcuts.o: genps.inc run_config.inc -invarients.o: genps.inc nexternal.inc -myamp.o: props.inc genps.inc nexternal.inc -reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ - run_config.inc -cluster.o: cluster.inc genps.inc nexternal.inc message.inc -addmothers.o: genps.inc nexternal.inc symswap.inc message.inc -unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ - run_config.inc -initcluster.o: message.inc - -# Extra dependencies on discretesampler.mod - -auto_dsig.o: .libs -driver.o: .libs -driver_cudacpp.o: .libs -$(MATRIX): .libs -genps.o: .libs - -# Cudacpp bldall targets - -ifeq ($(UNAME_P),ppc64le) -bldavxs: bldnone bldsse4 -else ifeq ($(UNAME_P),arm) -bldavxs: bldnone bldsse4 -else -bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z -endif - -ifneq ($(shell which hipcc 2>/dev/null),) -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldhip bldcuda bldavxs -else -bldall: bldhip bldavxs -endif -else -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldcuda bldavxs -else -bldall: bldavxs -endif -endif - -bldcuda: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cuda - -bldhip: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=hip - -bldnone: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppnone - -bldsse4: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 - -bldavx2: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 - -bld512y: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y - -bld512z: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z - -# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) - -clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn - $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(CUDACPP_BUILDDIR)/$(PROG)_hip - -cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src - $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall - rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs - rm -f .libs - -cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src - make -C ../../Source cleanall - rm -rf $(LIBDIR)libbias.$(libext) - rm -f ../../Source/*.mod ../../Source/*/*.mod - -distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation - $(MAKE) -f $(CUDACPP_MAKEFILE) distclean diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile new file mode 120000 index 0000000000..9fba275947 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile @@ -0,0 +1 @@ +makefile_wrapper.mk \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile_original.mk 
b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile_original.mk new file mode 100644 index 0000000000..348c283be7 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile_original.mk @@ -0,0 +1,101 @@ +include ../../Source/make_opts +FFLAGS+= -w + +# Load additional dependencies of the bias module, if present +ifeq (,$(wildcard ../bias_dependencies)) +BIASDEPENDENCIES = +else +include ../bias_dependencies +endif + +# Definitions + +LIBDIR = ../../lib/ +BINDIR = ../../bin/ +PROG = madevent + +ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") + include ../MadLoop_makefile_definitions +else + LINK_LOOP_LIBS = + LOOP_LIBS = + LOOP_INCLUDE = + LINK_MADLOOP_LIB = + MADLOOP_LIB = +endif + +LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L../../lib/ -ldhelas -ldsample -lmodel -lgeneric -lpdf -lgammaUPC -lcernlib $(llhapdf) -lbias + +LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) + +ifneq ("$(wildcard ../../Source/RUNNING)","") + LINKLIBS += -lrunning + LIBS += $(LIBDIR)librunning.$(libext) +endif + + +# Source files + +MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) +MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) +ifeq ($(strip $(MATRIX_HEL)),) + MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) +endif + + +PROCESS= driver.o myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o \ + $(patsubst %.f,%.o,$(wildcard auto_dsig*.f)) \ + +SYMMETRY = symmetry.o idenparts.o + +# Binaries + +$(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) + $(FC) -o $(PROG) $(PROCESS) $(MATRIX) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) + +$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat + cd ../../Source/MODEL; make + +$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat + cd ../../Source; make + +$(LIBDIR)libpdf.$(libext): + cd ../../Source/PDF; make + +$(LIBDIR)libgammaUPC.$(libext): + cd ../../Source/PDF/gammaUPC; make + +# Add source so that the compiler finds the DiscreteSampler module. 
+$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +# Dependencies + +driver.f: genps.inc +symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc +genps.o: genps.inc nexternal.inc configs.inc +dummy_fct.0: run.inc genps.inc +cuts.o: genps.inc nexternal.inc pmass.inc +setcuts.o: genps.inc run_config.inc +invarients.o: genps.inc nexternal.inc +myamp.o: props.inc genps.inc nexternal.inc +reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ + run_config.inc +cluster.o: cluster.inc genps.inc nexternal.inc message.inc +addmothers.o: genps.inc nexternal.inc symswap.inc message.inc +unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ + run_config.inc +initcluster.o: message.inc + +clean: + $(RM) *.o gensym madevent madevent_forhel diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/myamp.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min with dsqrt_shatmax**2 with the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/reweight.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/reweight.f index 0a0bafa7c1..9d8fe1c4f0 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/reweight.f @@ -976,9 +976,9 @@ logical function setclscales(p, keepq2bck, ivec) $ ' and jcentral is ',jcentral(1),jcentral(2) if (btest(mlevel,3)) then - write(*,'(a$)') 'QCD jets (final): ' + write(*,'(a,$)') 'QCD jets (final): ' do i=3,nexternal - if(iqjets(i).gt.0) write(*,'(i3$)') i + if(iqjets(i).gt.0) write(*,'(i3,$)') i enddo write(*,*) endif @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) @@ -1387,7 +1387,9 @@ double precision function rewgt(p, ivec) integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - include 'configs.inc' + integer fake_id + common/to_sprop/sprop,tprid,fake_id +c include 'configs.inc' real*8 xptj,xptb,xpta,xptl,xmtc real*8 xetamin,xqcut,deltaeta common /to_specxpt/xptj,xptb,xpta,xptl,xmtc,xetamin,xqcut,deltaeta @@ -1588,6 +1590,8 @@ double precision function rewgt(p, ivec) $ ipdgcl(1,igraphs(1),iproc),ipart,.false.).and. $ (goodjet(idacl(n,1)).or.goodjet(idacl(n,2)))) then c alpha_s weight + + if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1600,6 +1604,7 @@ double precision function rewgt(p, ivec) write(*,*)' as: ',alphas(alpsfact*dsqrt(q2now)), & '/',asref,' -> ',alphas(alpsfact*dsqrt(q2now))/asref write(*,*)' and G=',SQRT(4d0*PI*ALPHAS(scale)) + endif endif endif endif diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/runTest.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/symmetry.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/symmetry.f index 309540a0a2..d0706e90b4 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/symmetry.f @@ -51,6 +51,7 @@ program symmetry integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) + integer fake_id include 'configs.inc' data use_config/0,lmaxconfigs*0/ @@ -232,7 +233,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +243,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +261,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/unwgt.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/banner.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/banner.py index 42d82818d0..2bc6174b85 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/banner.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3306,7 +3353,7 @@ def edit_dummy_fct_from_file(self, filelist, outdir): def retro_compatible_custom_fct(lines, mode=None): f77_type = ['real*8', 'integer', 'double precision', 'logical'] - function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ + function_pat = re.compile(r'^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ % {'type':'|'.join(f77_type)}, re.I+re.M) include_pat = re.compile(r"\s+include\s+[\'\"]([\w\./]*)") @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - 
logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/check_param_card.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
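
The flavour_bias option introduced in the run_card hunk above is documented only by its inline comment, so here is a minimal standalone Python sketch of the underlying importance-sampling idea (illustrative only: the draw_channel helper and the dict layout are assumptions, not MadGraph code). A bias of [5, 100] makes channels containing a b quark 100 times more likely to be sampled while dividing their event weight by 100, so weighted sums, and hence the cross section, are unchanged; this is also why a non-trivial factor forces event_norm to 'bias'.

    import random

    def draw_channel(channels, flavour_bias=(5, 100.0)):
        """channels: {abs_pdg: cross_section}. Returns (abs_pdg, event_weight)."""
        pdg_bias, factor = flavour_bias
        boost = lambda pdg: factor if pdg == pdg_bias else 1.0
        # enhance the sampling probability of the biased flavour ...
        biased = {pdg: xs * boost(pdg) for pdg, xs in channels.items()}
        total = sum(biased.values())
        r, acc = random.uniform(0.0, total), 0.0
        for pdg, s in biased.items():
            acc += s
            if r <= acc:
                break
        # ... and compensate in the weight: on average the returned weight equals
        # sum(channels.values()) whatever 'factor' is, while each event of the
        # biased flavour individually carries a weight reduced by 'factor'.
        return pdg, total / boost(pdg)
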
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/common_run_interface.py index 9ff7390cf5..8de498fcc2 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. 
Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. (beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. 
Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" @@ -6868,8 +6890,9 @@ def handle_alarm(signum, frame): else: log_level=20 - - if run_card: + if run_card and (run_card['lpp1'] !=0 or run_card['lpp2'] !=0): + # They are likely case like lpp=+-3, where alpas not need reset + # but those have dedicated name of pdf avoid the reset as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, 'cteq6_l': 0.118, diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/file_writers.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/files.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/files.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, 
error=False): if error: raise if log: - logger.warning(why) + logger.warning("fail to cp", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "

To save bandwidth not all diagrams were converted to PNG."; print PAGE "

To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 
'nprocs' in opts: + self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/histograms.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/histograms.py index 51ae2914fc..0883cd9613 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central 
merging_scale for the variation. + if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2405,10 +2404,10 @@ def output(self, path, format='gnuplot',number_of_ratios = -1, gnuplot_output_list=gnuplot_output_list_v5 else: output, _ = p.communicate() - output.decode(errors='ignore') + output = output.decode(errors='ignore') if not output: gnuplot_output_list=gnuplot_output_list_v5 - elif float(output.split()[1]) < 5. : + elif int(output.split()[1].split('.')[0]) < 5 : gnuplot_output_list=gnuplot_output_list_v4 else: gnuplot_output_list=gnuplot_output_list_v5 @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. 
Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. 
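
A few lines above, the histograms.py hunk also stores the previously unassigned result of output.decode(...) and compares the gnuplot version via its integer major number instead of float() of the whole token. A minimal sketch of that kind of check, assuming gnuplot --version prints something like "gnuplot 5.4 patchlevel 2" (the helper name and the use of subprocess.run are illustrative, not the histograms.py code):

    import subprocess

    def gnuplot_major_version(cmd="gnuplot"):
        """Return gnuplot's major version as an int, or None if unavailable."""
        try:
            out = subprocess.run([cmd, "--version"], capture_output=True, check=True).stdout
        except (OSError, subprocess.CalledProcessError):
            return None
        text = out.decode(errors="ignore")  # bytes -> str; the result must be assigned
        # "gnuplot 5.4 patchlevel 2" -> token "5.4" -> major "5"; keeping only the
        # integer major part tolerates version tokens that float() would reject.
        try:
            return int(text.split()[1].split(".")[0])
        except (IndexError, ValueError):
            return None

Used as gnuplot_major_version() < 5, this selects the v4 output templates, matching the intent of the patched comparison.
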
import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/lhe_parser.py index f6e47956cd..d4b94bab10 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -1792,7 +1794,10 @@ def add_decays(self, pdg_to_decay): if particle.pdg in pdg_to_decay and pdg_to_decay[particle.pdg]: one_decay = pdg_to_decay[particle.pdg].pop() self.add_decay_to_particle(i, one_decay) + particle.helicity = 9 
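The define_init_banner hunks above normalize the group key with int() on every access, so that string and integer labels land in the same bucket, and they accumulate error**2, i.e. the per-group uncertainties are presumably combined in quadrature. A self-contained sketch of that accumulation pattern (sample numbers are illustrative; the original uses explicit if/else on plain dicts rather than defaultdict):

import math
from collections import defaultdict

grouped_cross = defaultdict(float)
grouped_error2 = defaultdict(float)

# group labels may arrive as str or int; int() makes '1' and 1 share a bucket
for group, cross, error in [('1', 2.0, 0.1), (1, 3.0, 0.2)]:
    grouped_cross[int(group)] += cross
    grouped_error2[int(group)] += error**2   # squared errors add

print(grouped_cross[1], math.sqrt(grouped_error2[1]))  # 5.0 0.2236...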
return self.add_decays(pdg_to_decay) + + return self @@ -2166,10 +2171,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2961,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..dea35930ea 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz 
%(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3667,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_comine_iteration(self, line): + def do_combine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
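The pseudorapidity fix in the lhe_parser.py hunk above flips the ratio inside the logarithm so that it matches the standard definition eta = 0.5*ln((|p|+pz)/(|p|-pz)) = -ln(tan(theta/2)); the previous ordering returned -eta. A standalone check (the momentum components are illustrative):

import math

def pseudorapidity(px, py, pz):
    norm = math.sqrt(px**2 + py**2 + pz**2)
    return 0.5 * math.log((norm + pz) / (norm - pz))

# a forward-moving particle (pz > 0) must have positive eta
print(pseudorapidity(1.0, 0.0, 2.0))   # ~ +1.44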
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
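Unless --old_interface is requested, do_pythia8 above now probes two candidate locations for Pythia8's main164 example driver and, if neither exists, warns and re-runs with the old MG5aMC_PY8_interface. A sketch of that lookup, with the retry left to the caller (the candidate paths follow the diff; the function name is hypothetical):

import os

def find_pythia_main164(pythia8_path):
    candidates = [
        os.path.join(pythia8_path, 'share', 'Pythia8', 'examples', 'main164'),
        os.path.join(pythia8_path, 'examples', 'main164'),
    ]
    for path in candidates:
        if os.path.exists(path):
            return path
    return None   # caller logs a warning and retries with ' --old_interface'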
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
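The run_PY8.sh wrapper templates above are now formatted in two passes: the first pass only injects the optional '-c' flag (empty for the old interface), so the placeholders needed later for the shell and the executable name are escaped as %%s and survive until the second pass. A minimal reproduction of that escaping (the shell and executable values are illustrative):

template = """#!%%s
./%%s %s PY8Card.dat >& PY8_log.txt
""" % '-c'                                      # pass 1: flag only
script = template % ('/bin/bash', 'main164')    # pass 2: shell and executable
print(script)
# #!/bin/bash
# ./main164 -c PY8Card.dat >& PY8_log.txt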
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -5596,6 +5635,19 @@ def do_plot(self, line): else: logger.info('No valid files for delphes plot') + def do_compile(self, line): + """compile the current directory """ + + args = self.split_arg(line) + self.ask_run_configuration(mode='parton') + self.run_card = banner_mod.RunCard(pjoin(self.me_dir, 'Cards', 'run_card.dat')) + self.configure_directory(html_opening =False) + + for Pdir in self.get_Pdir(): + misc.sprint(Pdir) + self.compile(['gensym'], cwd=Pdir) + self.compile(['madevent_forhel'], cwd=Pdir) + ############################################################################ def do_syscalc(self, line): """Evaluate systematics variation weights for a given run""" @@ -6132,7 +6184,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in 
Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file.' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6896,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6906,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7023,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... 
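The new remove_empty_events helper above keeps only the channel directories whose events.lhe actually contains data (anything missing or under 10 bytes is treated as empty) before classifying the empty ones by scanning their logs. The core filtering step, stripped of the log forensics, looks roughly like this (the directory list and the helper name are hypothetical):

import os

def keep_nonempty(gdirs, min_bytes=10):
    kept = []
    for gdir in gdirs:
        try:
            size = os.path.getsize(os.path.join(gdir, 'events.lhe'))
        except OSError:
            size = 0                  # missing file counts as empty
        if size >= min_bytes:
            kept.append(gdir)
    return kept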
self.exec_cmd('combine_events') @@ -6902,6 +7051,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7066,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7078,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7203,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7223,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/restore_data b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/restore_data +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . 
-mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/sum_html.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s

' % Presults.get_html(run, unit, cmd.me_dir))
    fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/madevent b/epochX/cudacpp/susy_gg_t1t1.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/madevent +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/src/HelAmps_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_t1t1.mad/src/HelAmps_MSSM_SLHA2.h index ec627d7759..85c140d111 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/src/HelAmps_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/src/HelAmps_MSSM_SLHA2.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/src/Parameters_MSSM_SLHA2.cc b/epochX/cudacpp/susy_gg_t1t1.mad/src/Parameters_MSSM_SLHA2.cc index d596fdf1ec..eafa38c4dd 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/src/Parameters_MSSM_SLHA2.cc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/src/Parameters_MSSM_SLHA2.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_t1t1.mad/src/Parameters_MSSM_SLHA2.h index 26a532156c..a9dc1dce79 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/src/Parameters_MSSM_SLHA2.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuConfig.h index 7c6a082392..be5c5a6357 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/test/cudacpp_test.mk b/epochX/cudacpp/susy_gg_t1t1.mad/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/susy_gg_t1t1.mad/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt index 45c009959b..0ef608d7aa 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -550,47 +549,47 @@ INFO: Please specify coupling orders to bypass this step. 
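The restore_data and cudacpp_test.mk changes above both branch on the host platform: macOS takes its core count from sysctl -n hw.ncpu and skips the -mavx2 -mfma flags, while Linux uses nproc --all and keeps them. The same probing expressed in Python, for illustration only (function names are made up; the flag choice mirrors the Makefile):

import os
import platform

def host_cores():
    # plays the role of `nproc --all` (Linux) / `sysctl -n hw.ncpu` (macOS)
    return os.cpu_count() or 1

def gtest_cxx_flags():
    # the Makefile only adds the x86 SIMD flags on non-Darwin hosts
    return '' if platform.system() == 'Darwin' else '-mavx2 -mfma'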
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.122 s +1 processes with 6 diagrams generated in 0.100 s Total: 1 processes with 6 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/. 
+DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/. Generated helas calls for 1 subprocesses (6 diagrams) in 0.008 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.182 s +ALOHA: aloha creates 3 routines in 0.151 s VVV1 VSS1 VSS1 VSS1 VVSS1 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. quit -real 0m1.324s -user 0m1.250s -sys 0m0.065s -Code generation completed in 1 seconds +real 0m1.343s +user 0m1.251s +sys 0m0.081s +Code generation completed in 2 seconds diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/COPYRIGHT b/epochX/cudacpp/susy_gg_t1t1.sa/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/COPYRIGHT +++ b/epochX/cudacpp/susy_gg_t1t1.sa/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). 
+ */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+
+#define gpuStream_t cudaStream_t
+#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) )
+#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) )
+
+#define gpuBlasStatus_t cublasStatus_t
+#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS
+#ifndef MGONGPU_HAS_NO_BLAS
+#define gpuBlasHandle_t cublasHandle_t
+#else
+#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds
+#endif
+#define gpuBlasCreate cublasCreate
+#define gpuBlasDestroy cublasDestroy
+#define gpuBlasSetStream cublasSetStream
+
+#define gpuBlasSaxpy cublasSaxpy
+#define gpuBlasSdot cublasSdot
+#define gpuBlasSgemv cublasSgemv
+#define gpuBlasSgemm cublasSgemm
+#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched
+#define gpuBlasDaxpy cublasDaxpy
+#define gpuBlasDdot cublasDdot
+#define gpuBlasDgemv cublasDgemv
+#define gpuBlasDgemm cublasDgemm
+#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched
+#define GPUBLAS_OP_N CUBLAS_OP_N
+#define GPUBLAS_OP_T CUBLAS_OP_T

//--------------------------------------------------------------------------

#elif defined __HIPCC__

+#ifndef MGONGPU_HAS_NO_BLAS
+#include "hipblas/hipblas.h"
+#endif
+
#define gpuError_t hipError_t
#define gpuPeekAtLastError hipPeekAtLastError
#define gpuGetErrorString hipGetErrorString
@@ -48,22 +91,69 @@
#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) )
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) )
+
#define gpuSetDevice hipSetDevice
#define gpuDeviceSynchronize hipDeviceSynchronize
#define gpuDeviceReset hipDeviceReset
#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
-#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ...
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
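As a quick illustration of the checkGpuBlas/assertGpuBlas error-checking pattern added in GpuRuntime.h above, here is a self-contained usage sketch written directly against cuBLAS for clarity; the plugin itself goes through the gpuBlas* abstraction macros, and the checkBlas/assertBlas names below are hypothetical stand-ins for the plugin's checkGpuBlas/assertGpuBlas.

#include <cublas_v2.h>
#include <cassert>
#include <cstdio>

// Hypothetical stand-in for checkGpuBlas/assertGpuBlas: print file and line, then abort on failure
#define checkBlas( code ) { assertBlas( code, __FILE__, __LINE__ ); }
inline void assertBlas( cublasStatus_t code, const char* file, int line, bool abort = true )
{
  if( code != CUBLAS_STATUS_SUCCESS )
  {
    printf( "ERROR! assertBlas: '%d' in %s:%d\n", code, file, line );
    if( abort ) assert( code == CUBLAS_STATUS_SUCCESS );
  }
}

int main()
{
  cublasHandle_t handle;
  checkBlas( cublasCreate( &handle ) );  // fail fast with file/line information if cuBLAS initialization fails
  checkBlas( cublasDestroy( handle ) );
  return 0;
}

The design choice mirrors the existing checkGpu wrapper: wrapping every BLAS call keeps error reporting uniform across the CUDA, HIP and noBLAS build variants.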
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryBuffers.h index 5bd3053393..c5e79dc1b1 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif 
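The m_helStreams[CPPProcess::ncomb] array added to MatrixElementKernelDevice above is only populated for the nGoodHel good helicities found at runtime, and the destructor only destroys streams that were actually created. A minimal CUDA sketch of that bookkeeping, using the plain CUDA runtime API rather than the gpuStream_t macros; the ncomb value and the helper names are illustrative only.

#include <cuda_runtime.h>
#include <cassert>

constexpr int ncomb = 16;            // illustrative: total number of helicity combinations
cudaStream_t helStreams[ncomb] = {}; // zero-initialized, so unused entries stay nullptr

void createHelStreams( int nGoodHel ) // called once the good helicities are known
{
  assert( nGoodHel > 0 && nGoodHel <= ncomb );
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
    cudaStreamCreate( &helStreams[ighel] ); // one stream per good helicity
}

void destroyHelStreams() // mirrors the MatrixElementKernelDevice destructor logic in this diff
{
  for( int ihel = 0; ihel < ncomb; ihel++ )
    if( helStreams[ihel] ) cudaStreamDestroy( helStreams[ihel] ); // skip streams that were never created
}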
@@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + 
typedef DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc index 1d53b4a535..e8819f6df2 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_MSSM_SLHA2.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_MSSM_SLHA2_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE 
void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = 
DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
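The DeviceAccessJamp2 accessor introduced above stores the per-color |jamp|^2 running sums in a color-major [ncolor][nevt] layout, and the per-helicity kernels launched in separate streams accumulate into it with atomicAdd (as shown further below in this file's diff). A minimal CUDA sketch of that layout and accumulation, with one thread per event; jamp2Icol and addJamp2ForOneHelicity are illustrative names, and jampAbs2 stands for a precomputed |jamp|^2 buffer for one helicity.

__device__ inline double& jamp2Icol( double* buffer, int icol )
{
  const int nevt = gridDim.x * blockDim.x;               // one thread per event
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  return buffer[icol * nevt + ievt];                     // color-major: each color owns a contiguous slice of nevt values
}

__global__ void addJamp2ForOneHelicity( double* colAllJamp2s, const double* jampAbs2, int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  for( int icol = 0; icol < ncolor; icol++ )
    atomicAdd( &jamp2Icol( colAllJamp2s, icol ), jampAbs2[icol * nevt + ievt] ); // safe when helicities run in concurrent streams
  // NB: atomicAdd on double requires compute capability >= 6.0
}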
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -404,154 +460,43 @@ namespace mg5amcCpu jamp_sv[1] += amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_t1t1x()?) 
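
The atomicAdd in the GPU branch above is what keeps the per-colour jamp2 accumulation correct once each good helicity runs in its own stream: several instances of the kernel may add into the same colAllJamp2s slot concurrently. A standalone CUDA sketch of that pattern follows; the names are hypothetical (the plugin uses its DeviceAccessJamp2 accessor instead) and atomicAdd on double assumes compute capability 6.0 or later.

__global__ void accumulateJamp2Sketch( double* jamp2,          // shared buffer [ncolor * nevt], summed over helicities
                                        const double* absJamp2, // |jamp|^2 for this helicity [ncolor * nevt]
                                        int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  for( int icol = 0; icol < ncolor; icol++ )
    atomicAdd( &jamp2[icol * nevt + ievt], absJamp2[icol * nevt + ievt] ); // race-free "+=" across streams
}
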
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2 }, - { -2, 16 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
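
For reference, the rewrite invoked in the comment above (and repeated in the new color_sum.cc further below) can be spelled out once. Write the colour amplitudes as J = A + iB with A and B real, and let N[i][j] = cf[i][j] / denom[i], which is symmetric here because the two denominators are equal:

  (A - iB)^T N (A + iB) = A^T N A + B^T N B + i ( A^T N B - B^T N A )
                        = A^T N A + B^T N B      [the imaginary part cancels since N = N^T gives A^T N B = B^T N A]

so only real products of the jamp real and imaginary parts are needed; folding the symmetric N into a triangular matrix with doubled off-diagonal entries is then just a regrouping of the same sum.
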
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -579,7 +524,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -612,6 +561,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_Msu3 ); m_masses.push_back( m_pars->mdl_Msu3 ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_Msu3, (fptype)m_pars->mdl_Wsu3, (fptype)m_pars->mdl_Msu6, (fptype)m_pars->mdl_Wsu6 }; @@ -652,6 +605,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_MSSM_SLHA2::ZERO ); m_masses.push_back( Parameters_MSSM_SLHA2::mdl_Msu3 ); m_masses.push_back( Parameters_MSSM_SLHA2::mdl_Msu3 ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -772,8 +729,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -781,25 +738,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store 
the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -944,13 +1079,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -962,18 +1091,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -998,93 +1132,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1126,7 +1197,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1149,7 +1220,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1158,21 +1229,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1186,8 +1259,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1203,11 +1278,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1309,14 +1385,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h index d48c729c48..f01e3c5efd 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_MSSM_SLHA2.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 4; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 6; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, 
running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/color_sum.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/color_sum.cc new file mode 100644 index 0000000000..b68b9250fd --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/color_sum.cc @@ -0,0 +1,427 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
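
As a size check for the ghelAllJamps "super-buffer" appearing in the sigmaKin prototypes above (layout [2][ncolor][nGoodHel][nevt], matching the gpuMemset in sigmaKin), a hypothetical helper that is not part of the plugin API:

#include <cstddef>
constexpr std::size_t jampSuperBufferElems( std::size_t nGoodHel, std::size_t ncolor, std::size_t nevt )
{
  constexpr std::size_t nx2 = 2; // real and imaginary parts (mgOnGpu::nx2)
  return nx2 * ncolor * nGoodHel * nevt; // number of fptype elements (multiply by sizeof(fptype) for bytes)
}
// e.g. for this process (ncolor = 2), 4 good helicities and 16384 events:
static_assert( jampSuperBufferElems( 4, 2, 16384 ) == 262144, "illustrative size check" );
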
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2 }, + { -2, 16 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
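
To make the triangular-matrix shortcut above concrete, here is a standalone sketch (separate from the generated function body in this diff, with a hypothetical name) of the same computation for this process's ncolor = 2 matrix; it relies on the two colour denominators being equal, as they are here, so that cf[i][j] / denom[i] is symmetric.

#include <array>
#include <complex>
double colorSumSketch( const std::array<std::complex<double>, 2>& jamp )
{
  constexpr int ncolor = 2;
  constexpr double denom[ncolor] = { 3, 3 };
  constexpr double cf[ncolor][ncolor] = { { 16, -2 }, { -2, 16 } };
  double me2 = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    // diagonal term, then doubled upper-diagonal terms (cf is real and symmetric, see #475)
    double ztR = cf[icol][icol] / denom[icol] * jamp[icol].real();
    double ztI = cf[icol][icol] / denom[icol] * jamp[icol].imag();
    for( int jcol = icol + 1; jcol < ncolor; jcol++ )
    {
      ztR += 2 * cf[icol][jcol] / denom[icol] * jamp[jcol].real();
      ztI += 2 * cf[icol][jcol] / denom[icol] * jamp[jcol].imag();
    }
    me2 += jamp[icol].real() * ztR + jamp[icol].imag() * ztI; // AMA + BMB
  }
  return me2; // e.g. jamp = { 1, i } gives 32/3
}
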
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/color_sum.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/cudacpp_overlay.mk b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- 
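For reference, the two gpuBlasTgemm calls and the two gpuBlasTgemmStridedBatched calls in color_sum_blas above evaluate, for every good helicity ihel and every event ievt, ME(ihel,ievt) += Re(J)^T C Re(J) + Im(J)^T C Im(J), where C is the real, pre-normalized color matrix and J is the vector of ncolor color amplitudes (jamps); for a symmetric C this equals the usual sum_ij J_i* C_ij J_j. The minimal host-side sketch below is an illustration only, not part of the plugin (the function name colorSumReference and the use of std::vector are assumptions of this example); it spells out the same computation and the same [real/imag][icol][ihel][ievt] striding in plain C++.

#include <vector>

void colorSumReference( std::vector<double>& allMEs,         // [nhel*nevt], incremented in place (index ihel*nevt+ievt)
                        const std::vector<double>& allJamps, // [2*ncolor*nhel*nevt], striding as in DeviceAccessJamp
                        const std::vector<double>& colorMat, // [ncolor*ncolor], real symmetric, already normalized
                        const int ncolor, const int nhel, const int nevt )
{
  for( int ihel = 0; ihel < nhel; ihel++ )
    for( int ievt = 0; ievt < nevt; ievt++ )
      for( int part = 0; part < 2; part++ ) // 0 = real parts, 1 = imaginary parts
      {
        double dot = 0; // J^T * ( C * J ) for this (part, ihel, ievt)
        for( int icol = 0; icol < ncolor; icol++ )
        {
          double ztemp = 0; // Step 1 equivalent: one element of Ztemp = ColorMatrix * JampsVector
          for( int jcol = 0; jcol < ncolor; jcol++ )
            ztemp += colorMat[icol * ncolor + jcol] * allJamps[part * ncolor * nhel * nevt + jcol * nhel * nevt + ihel * nevt + ievt];
          dot += allJamps[part * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] * ztemp; // Step 2 equivalent
        }
        allMEs[ihel * nevt + ievt] += dot; // beta=1: add to the running ME sum
      }
}

In the GPU implementation, Step 1 is a single ncolor x (nhel*nevt) GEMM per real/imaginary part, and Step 2 is a strided-batched GEMM of 1x1 dot products with beta=1, so the real and imaginary contributions accumulate into the same MEs buffer.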
/dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/fbridge.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/makefile_original.mk b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/color_sum.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 
2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. 
+ +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/runTest.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/src/HelAmps_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_t1t1.sa/src/HelAmps_MSSM_SLHA2.h index ec627d7759..85c140d111 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/src/HelAmps_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/src/HelAmps_MSSM_SLHA2.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/src/Parameters_MSSM_SLHA2.cc b/epochX/cudacpp/susy_gg_t1t1.sa/src/Parameters_MSSM_SLHA2.cc index d596fdf1ec..eafa38c4dd 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/src/Parameters_MSSM_SLHA2.cc +++ b/epochX/cudacpp/susy_gg_t1t1.sa/src/Parameters_MSSM_SLHA2.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_t1t1.sa/src/Parameters_MSSM_SLHA2.h index 26a532156c..a9dc1dce79 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/src/Parameters_MSSM_SLHA2.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuConfig.h index d3c4ca5695..7d34de72f8 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for computing the color sum in the matrix element calculation +// For both CUDA and HIP, by default, assume that cuBLAS/hipBLAS are available, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!]
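The MGONGPU_HAS_NO_BLAS block above sets the compile-time default (cuBLAS/hipBLAS assumed available in CUDA and HIP builds, never used in C++-only builds); color_sum_gpu additionally honours the runtime CUDACPP_RUNTIME_BLASCOLORSUM switch before taking the BLAS path. Below is a minimal sketch of how the two choices can be combined, assuming a hypothetical helper name and assuming that any non-empty value of the environment variable opts in.

#include <cstdlib>

// Hypothetical helper (not part of the plugin): decide whether color_sum_gpu should be given
// a cuBLAS/hipBLAS handle (BLAS path) or a null pointer (kernel path).
bool useBlasColorSum()
{
#ifdef MGONGPU_HAS_NO_BLAS
  return false; // hasNoBlas build: the BLAS color sum is compiled out
#else
  const char* env = std::getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); // assumed semantics: opt-in via env variable
  return env != nullptr && *env != '\0';
#endif
}

When this returns false, the caller would pass a null gpuBlasHandle_t* and a null scratch buffer to color_sum_gpu, which then falls back to the per-helicity color_sum_kernel launches (CASE 1 above).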
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/test/cudacpp_test.mk b/epochX/cudacpp/susy_gg_t1t1.sa/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/susy_gg_t1t1.sa/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
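The scratch buffer for the BLAS color sum is sized exactly as the gpuMemset calls in color_sum_gpu and the pointer offsets in color_sum_blas require: one Ztemp block of ncolor*2*nGoodHel*nevt fptype2 values, plus, only in the mixed FPTYPE=double / FPTYPE2=float configuration selectable in mgOnGpuConfig.h above, a same-sized float copy of the jamps and an nGoodHel*nevt float block for the MEs. The arithmetic, as a hypothetical standalone helper (the name blasTmpSize is an assumption of this sketch):

#include <cstddef>

// Hypothetical helper (not in the plugin): number of fptype2 elements needed in the
// ghelAllBlasTmp scratch buffer, matching the sizes used in the gpuMemset calls of color_sum_gpu.
std::size_t blasTmpSize( const int ncolor, const int nGoodHel, const int nevt, const bool mixedFptypes )
{
  const std::size_t nx2 = 2;                                                            // real and imaginary parts
  const std::size_t slab = ncolor * nx2 * static_cast<std::size_t>( nGoodHel ) * nevt;  // one [2][ncolor][nhel][nevt] block
  if( mixedFptypes )                                                                    // FPTYPE=double with FPTYPE2=float
    return 2 * slab + static_cast<std::size_t>( nGoodHel ) * nevt;                      // Ztemp + float jamps + float MEs
  return slab;                                                                          // Ztemp only (jamps and MEs are used in place)
}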
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt index 9e7dad46ce..96e4d4a727 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -550,21 +549,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.118 s +1 processes with 3 diagrams generated in 0.083 s Total: 1 processes with 3 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_tt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_susy_gg_tt INFO: remove old information in CODEGEN_mad_susy_gg_tt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -576,54 +575,49 @@ FileWriter t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1552]  -Generated helas calls for 1 subprocesses (3 diagrams) in 0.007 s -Wrote files for 10 helas calls in 0.076 s +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1577]  +Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s +Wrote files for 10 helas calls in 0.071 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.137 s +ALOHA: aloha creates 2 routines in 0.149 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 
0.133 s +ALOHA: aloha creates 4 routines in 0.122 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/SubProcesses/P1_gg_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt done. +DEBUG: result.returncode =  0 [output.py at line 273]  +Output to directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/README +/home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/README Run "open index.html" to see more information about this process. 
quit -real 0m2.854s -user 0m2.558s -sys 0m0.284s -Code generation completed in 3 seconds +real 0m3.237s +user 0m2.790s +sys 0m0.439s +Code generation completed in 4 seconds ************************************************************ * * * W E L C O M E to * @@ -636,7 +630,7 @@ Code generation completed in 3 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -644,10 +638,9 @@ Code generation completed in 3 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -666,7 +659,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -674,10 +667,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/susy_gg_tt.mad/COPYRIGHT b/epochX/cudacpp/susy_gg_tt.mad/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/COPYRIGHT +++ b/epochX/cudacpp/susy_gg_tt.mad/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. 
Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt index 68b4c46295..311ceaa803 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/susy_gg_tt.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/susy_gg_tt.mad/Cards/proc_card_mg5.dat index 25f63a3016..22710756d6 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/susy_gg_tt.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.5 2025-10-17 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/susy_gg_tt.mad/Cards/run_card.dat b/epochX/cudacpp/susy_gg_tt.mad/Cards/run_card.dat index 6b82577032..000832aacd 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Cards/run_card.dat +++ b/epochX/cudacpp/susy_gg_tt.mad/Cards/run_card.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/susy_gg_tt.mad/Cards/run_card_default.dat b/epochX/cudacpp/susy_gg_tt.mad/Cards/run_card_default.dat index b8db871c35..85e1d39035 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/susy_gg_tt.mad/Cards/run_card_default.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/susy_gg_tt.mad/MGMEVersion.txt b/epochX/cudacpp/susy_gg_tt.mad/MGMEVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/MGMEVersion.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.mad/Source/.make_opts b/epochX/cudacpp/susy_gg_tt.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Source/.make_opts +++ b/epochX/cudacpp/susy_gg_tt.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/susy_gg_tt.mad/Source/alfas_functions.f b/epochX/cudacpp/susy_gg_tt.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/susy_gg_tt.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/susy_gg_tt.mad/Source/cuts.inc b/epochX/cudacpp/susy_gg_tt.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Source/cuts.inc +++ b/epochX/cudacpp/susy_gg_tt.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/susy_gg_tt.mad/Source/make_opts b/epochX/cudacpp/susy_gg_tt.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Source/make_opts +++ b/epochX/cudacpp/susy_gg_tt.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/susy_gg_tt.mad/Source/makefile b/epochX/cudacpp/susy_gg_tt.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Source/makefile +++ b/epochX/cudacpp/susy_gg_tt.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/susy_gg_tt.mad/Source/run_card.inc b/epochX/cudacpp/susy_gg_tt.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Source/run_card.inc +++ b/epochX/cudacpp/susy_gg_tt.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. 
+// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. 
In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). + */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
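As an illustration of the block/thread selection in the Bridge constructor hunk above, the following is a minimal standalone sketch (not part of the patch): it keeps halving the number of threads per block, starting from the default of 256, until it divides the number of events exactly, and enforces the minimum of s_gputhreadsmin = 32 shown in the diff. The helper name chooseGpuGrid and its signature are hypothetical.

#include <stdexcept>
#include <string>

// Hypothetical helper mirroring the grid-sizing loop in Bridge::Bridge (sketch only)
inline void chooseGpuGrid( int nevt, int& gpublocks, int& gputhreads )
{
  constexpr int gputhreadsmin = 32;                        // minimum number of gpu threads (DEFAULT)
  if( nevt < gputhreadsmin || nevt % gputhreadsmin != 0 )
    throw std::runtime_error( "nevt should be a multiple of " + std::to_string( gputhreadsmin ) );
  gputhreads = 256;                                        // default number of gpu threads
  gpublocks = nevt / gputhreads;                           // integer division: may not cover nevt yet
  while( nevt != gpublocks * gputhreads )
  {
    gputhreads /= 2;                                       // halve the block size until it divides nevt
    if( gputhreads < gputhreadsmin )
      throw std::logic_error( "cannot choose gputhreads" ); // cannot happen if nevt is a multiple of 32
    gpublocks = nevt / gputhreads;
  }
}

For example, nevt=8192 gives 32 blocks of 256 threads, while nevt=96 falls back to 3 blocks of 32 threads.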
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
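The transposition hunks above repeatedly use the same AOS-to-AOSOA index arithmetic; the following is a minimal host-side sketch (not part of the patch) of that mapping, with npar, np4 and neppM as plain parameters rather than the CPPProcess and MemoryAccessMomenta constants used in the generated code.

#include <cassert>

// Sketch of the F2C (Fortran AOS to C++ AOSOA) momenta transposition (illustration only)
template<typename T>
void transposeMomentaF2C( T* out, const T* in, int nevt, int npar, int np4, int neppM )
{
  assert( nevt % neppM == 0 ); // number of events must be a multiple of neppM
  for( int ievt = 0; ievt < nevt; ievt++ )
  {
    const int ipagM = ievt / neppM; // SIMD "page" index
    const int ieppM = ievt % neppM; // event index within the page
    for( int ipar = 0; ipar < npar; ipar++ )
      for( int ip4 = 0; ip4 < np4; ip4++ )
      {
        // C-style AOSOA[npagM][npar][np4][neppM]
        const int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM;
        // Fortran-style AOS[nevt][npar][np4]
        const int fpos = ievt * npar * np4 + ipar * np4 + ip4;
        out[cpos] = in[fpos]; // F2C (Fortran to C)
      }
  }
}

As noted in the diff, with neppM=1 the two layouts coincide and the loop degenerates into a plain copy.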
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MGVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
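To make the new abstraction layer above concrete, here is a hedged usage sketch (not part of the patch, and not the actual color_sum_blas implementation): it shows how a caller might combine the gpuBlas* aliases from GpuAbstraction.h with the checkGpuBlas assertion added to GpuRuntime.h. It assumes mgOnGpuConfig.h defines fptype2, that the file is compiled with nvcc or hipcc, and that the buffer names and dimensions are placeholders.

#ifndef MGONGPU_HAS_NO_BLAS
// Sketch: one strided-batched GEMM, issued on a caller-provided stream (illustration only)
inline void batchedGemmSketch( const fptype2* d_A, const fptype2* d_B, fptype2* d_C,
                               int n, int nbatch, gpuStream_t stream )
{
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) );
  checkGpuBlas( gpuBlasSetStream( handle, stream ) ); // e.g. one of the per-helicity streams
  const fptype2 alpha = 1;
  const fptype2 beta = 0;
  // C_i = A_i * B_i for nbatch independent n-by-n column-major matrices
  checkGpuBlas( gpuBlasTgemmStridedBatched( handle, GPUBLAS_OP_N, GPUBLAS_OP_N,
                                            n, n, n,
                                            &alpha,
                                            d_A, n, (long long)n * n,
                                            d_B, n, (long long)n * n,
                                            &beta,
                                            d_C, n, (long long)n * n,
                                            nbatch ) );
  checkGpuBlas( gpuBlasDestroy( handle ) );
}
#endif

The gpuBlasTgemmStridedBatched alias resolves to the single or double precision cuBLAS/hipBLAS routine according to MGONGPU_FPTYPE2_FLOAT, so the same caller code serves both builds.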
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
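The runtime BLAS switch wired into the MatrixElementKernelDevice constructor above boils down to a small environment-variable check; the following standalone sketch (not part of the patch; the helper name is hypothetical) reproduces that decision logic, including the hard error when BLAS support was compiled out.

#include <cstdlib>
#include <iostream>
#include <stdexcept>
#include <string>

// Sketch of the CUDACPP_RUNTIME_BLASCOLORSUM decision (illustration only)
inline bool envRequestsBlasColorSum()
{
  const char* blasEnv = std::getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" );
  const bool requested = ( blasEnv && std::string( blasEnv ) != "" ); // set and non-empty
#ifndef MGONGPU_HAS_NO_BLAS
  if( requested )
    std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl;
  else
    std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl;
  return requested;
#else
  if( requested )
    throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" );
  std::cout << "INFO: BLAS was disabled at build time" << std::endl;
  return false;
#endif
}

The TF32 tensor-core switch (CUDACPP_RUNTIME_CUBLASTF32TENSOR) in the same constructor follows the same pattern, with the extra constraints that BLAS must be enabled at runtime and that the color sums use FP32.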
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryBuffers.h index 5bd3053393..c5e79dc1b1 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ 
-194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + 
typedef DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 5c62f1bfad..9eb05a51e9 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_MSSM_SLHA2.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_MSSM_SLHA2_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE 
void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = 
DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
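// Illustrative sketch, assuming the color-major SoA layout used by the DeviceAccessJamp2 accessor above
// (element = buffer[icol * nevt + ievt], buffer size = ncolor * nevt): a hypothetical host-side helper,
// e.g. to sum jamp2 over colors for one event after a device-to-host copy.
// The helper name is an assumption for illustration only and does not exist in this patch.
inline fptype hostSumJamp2OverColors( const fptype* hstJamp2s, const int ievt, const int nevt, const int ncolor )
{
  fptype sum = 0;
  for( int icol = 0; icol < ncolor; icol++ )
    sum += hstJamp2s[icol * nevt + ievt]; // same indexing as DeviceAccessJamp2::kernelAccessIcolConst
  return sum;
}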
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -368,154 +424,43 @@ namespace mg5amcCpu jamp_sv[1] -= amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttx()?) 
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2 }, - { -2, 16 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
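// Illustrative sketch of the identity quoted in the removed comment above: for a REAL SYMMETRIC
// color matrix cf and jamp = A + iB, the hermitian quadratic form conj(jamp)^T * cf * jamp equals
// A^T*cf*A + B^T*cf*B, because the cross terms i*( A^T*cf*B - B^T*cf*A ) cancel when cf is symmetric.
// The standalone helper below is a hypothetical double-precision check, not code from this repository.
#include <complex>
#include <cstddef>
#include <vector>
inline double colorQuadraticForm( const std::vector<std::vector<double>>& cf,     // real symmetric ncolor x ncolor matrix
                                  const std::vector<std::complex<double>>& jamp ) // ncolor complex partial amplitudes
{
  double sum = 0;
  for( std::size_t i = 0; i < jamp.size(); i++ )
    for( std::size_t j = 0; j < jamp.size(); j++ )
      sum += cf[i][j] * ( jamp[i].real() * jamp[j].real() + jamp[i].imag() * jamp[j].imag() );
  return sum; // the real (and only non-vanishing) part of conj(jamp)^T * cf * jamp
}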
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -555,7 +500,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -588,6 +537,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -628,6 +581,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_MSSM_SLHA2::ZERO ); m_masses.push_back( Parameters_MSSM_SLHA2::mdl_MT ); m_masses.push_back( Parameters_MSSM_SLHA2::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -748,8 +705,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -757,25 +714,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event 
random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -920,13 +1055,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -938,18 +1067,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -974,93 +1108,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1102,7 +1173,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1125,7 +1196,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1134,21 +1205,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1162,8 +1235,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1179,11 +1254,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1285,14 +1361,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 24c27005b8..f74d539775 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_MSSM_SLHA2.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 3; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, 
running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f index bc9bcfeb9b..008afc92ae 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f index db3c284caa..fc3ede89c4 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -442,51 +446,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 
0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc new file mode 100644 index 0000000000..b68b9250fd --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc @@ -0,0 +1,427 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2 }, + { -2, 16 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity 
+ const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given 
event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! 
From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need 
one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/configs.inc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/configs.inc index 99d3eecc56..0dbac30825 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/configs.inc +++ 
b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/configs.inc @@ -24,3 +24,5 @@ C Diagram 3 DATA (SPROP(I,-2,3),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/3/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/cudacpp_overlay.mk b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f index ec5722702a..30cca27587 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/fbridge.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/makefile_original.mk b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f index c9610a83ed..35011737bd 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -71,10 +71,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -227,17 +224,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -307,7 +293,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -350,7 +336,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -393,21 +380,24 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 2) /5.333333333333333D+00, - $ -6.666666666666666D-01/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 2) /16,-4/ C 1 T(1,2,3,4) - DATA (CF(I, 2),I= 1, 2) /-6.666666666666666D-01 - $ ,5.333333333333333D+00/ + DATA (CF(I),I= 3, 3) /16/ C 1 T(2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -446,10 +436,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -458,6 +450,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/addmothers.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/addmothers.f index 9a31ed201d..d6cded9a2d 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! 
color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else @@ -220,7 +221,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, ncolmp=0 endif if(mo_color.gt.1.and. - $ mo_color.ne.3.and.mo_color.ne.8)then + $ mo_color.ne.3.and.mo_color.ne.8.and.mo_color.ne.6)then da_color(1)=get_color(jpart(1,ida(1))) da_color(2)=get_color(jpart(1,ida(2))) call write_error(da_color(1), da_color(2), mo_color) @@ -326,8 +327,8 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif endif !end of check on LC -c Just zero helicity info for intermediate states - jpart(7,i) = 0 +c Just No helicity info for intermediate states + jpart(7,i) = 9 enddo ! do i 100 continue if (is_LC) call check_pure_internal_flow(icolalt,jpart, maxcolor) @@ -586,13 +587,13 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, i3=i3+1 c color for t-channels needs to be reversed if(i3.eq.1) icol(2,ires)=icolmp(1,i) - if(i3.eq.2) icol(1,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 c color for t-channels needs to be reversed if(i3bar.eq.1) icol(1,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(2,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(2,i) endif enddo @@ -764,6 +765,14 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, endif endif c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) + elseif(mo_color.eq.6.and.i3.eq.0.and.i3bar.eq.2)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue + elseif(mo_color.eq.6.and.i3.eq.2.and.i3bar.eq.0)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue else c Don't know how to deal with this call write_error(i3,i3bar,mo_color) @@ -814,12 +823,12 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(icolmp(1,i).gt.0)then i3=i3+1 if(i3.eq.1) icol(1,ires)=icolmp(1,i) - if(i3.eq.2) icol(2,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 if(i3bar.eq.1) icol(2,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(1,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(2,i) endif enddo @@ -830,23 +839,33 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(n3.le.1.and.n3bar.eq.0) icol(2,ires)=0 if(i3.ne.n3.or.i3bar.ne.n3bar) then - if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.0.and.i3.eq.0)then + if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.i3)then c This is an epsilon index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(1,ires)=maxcolor + if(i3.eq.0) then + maxcolor=maxcolor+1 + icol(1,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(2,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(2,ires)=-maxcolor endif - 
elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.0.and.i3bar.eq.0)then + elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.i3bar)then c This is an epsilonbar index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(2,ires)=maxcolor + if(i3bar.eq.0)then + maxcolor=maxcolor+1 + icol(2,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(1,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(1,ires)=-maxcolor endif elseif(n3.gt.0.and.n3bar.eq.0.and.i3-i3bar.eq.n3.or. $ n3bar.gt.0.and.n3.eq.0.and.i3bar-i3.eq.n3bar.or. @@ -961,6 +980,12 @@ subroutine fix_s_color_indices(n3,n3bar,i3,i3bar,ncolmp,icolmp, if(n3.eq.1) icol(1,ires)=max_n3 if(n3bar.eq.1) icol(2,ires)=min_n3bar endif + do i=ires,-1 + if (icol(1,i).eq.maxcol) icol(1,i)=mincol + if (icol(1,i).eq.-maxcol) icol(1,i)=-mincol + if (icol(2,i).eq.maxcol) icol(2,i)=mincol + if (icol(2,i).eq.-maxcol) icol(2,i)=-mincol + enddo c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) endif else diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cluster.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/color_sum.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
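// A minimal worked sketch of the per-event color sum behind the interfaces declared in this header
// (implemented in color_sum.cc above). The free function and the jampRe/jampIm names are hypothetical
// illustrations, not part of the generated code, and plain double stands in for fptype/fptype2; the
// ncolor, colorMatrix and colorDenom constants are those defined in color_sum.cc:
//   ME += sum_{i,j} Re( conj(jamp_i) * jamp_j ) * colorMatrix[i][j] / colorDenom[i]
// which, since the color matrix is real, reduces to two real quadratic forms:
//   double colorSumOneEvent( const double jampRe[], const double jampIm[] )
//   {
//     double deltaME = 0;
//     for( int icol = 0; icol < ncolor; icol++ )
//       for( int jcol = 0; jcol < ncolor; jcol++ )
//         deltaME += ( jampRe[icol] * jampRe[jcol] + jampIm[icol] * jampIm[jcol] )
//                    * colorMatrix[icol][jcol] / colorDenom[icol];
//     return deltaME; // the caller adds this to the running sum over helicities in allMEs
//   }
// The device super-buffers below use the [2][ncolor][nhel][nevt] striding documented in DeviceAccessJamp,
// i.e. element ( reim, icol, ihel, ievt ) sits at index ( ( reim * ncolor + icol ) * nhel + ihel ) * nevt + ievt.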
+ +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef 
MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
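# Illustrative usage sketch for the BLAS-related build options configured further below (these exact
# command lines are hypothetical examples, not generated content):
#   make -f cudacpp.mk BACKEND=cuda HASBLAS=hasBlas    # link against cuBLAS (the default whenever cublas_v2.h is found)
#   make -f cudacpp.mk BACKEND=hip HASBLAS=hasNoBlas MADGRAPH_HIP_ARCHITECTURE=gfx90a
# Note that, per the comments in color_sum.cc above, a hasBlas build still falls back to the per-helicity
# CUDA/HIP color sum kernel unless the CUDACPP_RUNTIME_BLASCOLORSUM environment variable is set at run time.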
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
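# As an illustration (not generated content): in this P1_gg_ttx directory the wildcard picks up
# auto_dsig.f and auto_dsig1.f, and auto_dsig.f itself is filtered out, so the groups defined just
# below would expand roughly as
#   DSIG         = driver.o auto_dsig1.o
#   DSIG_cudacpp = driver_cudacpp.o auto_dsig1_cudacpp.o
# while auto_dsig.o is compiled on its own and added explicitly in the link rules further down.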
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cuts.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. + return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. 
Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. + +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/genps.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/genps.f index 1c32e93f5d..5449ab9e30 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/genps.f @@ -124,7 +124,8 @@ subroutine gen_mom(iconfig,mincfig,maxcfig,invar,wgt,x,p1) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer fake_id + common/to_sprop/sprop,tprid,fake_id logical firsttime double precision xprop(3,nexternal),tprop(3,nexternal) @@ -1373,6 +1374,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1389,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not. 
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1637,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1657,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1818,8 +1834,8 @@ double precision function get_channel_cut(p, config) common/to_forest/ iforest, tstrategy integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) - integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer tprid(-max_branch:-1,lmaxconfigs), fake_id + common/to_sprop/sprop,tprid,fake_id double precision stot,m1,m2 common/to_stot/stot,m1,m2 @@ -1915,7 +1931,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1946,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile deleted file mode 100644 index 49e6800fff..0000000000 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile +++ /dev/null @@ -1,327 +0,0 @@ -SHELL := /bin/bash - -include ../../Source/make_opts - -# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) -# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing -include ../../src/cudacpp_config.mk -ifeq ($(CUDACPP_BUILDDIR),) -$(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) -endif - -# Disable all Fortran warnings? 
-FFLAGS+= -w - -# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html -FFLAGS+= -cpp - -# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) -CXXFLAGS = -O3 -Wall -Wshadow -Wextra - -# Add -std=c++17 explicitly to avoid build errors on macOS -# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" -ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 -endif - -# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) -ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) - override CXX:=ccache $(CXX) -endif -###ifeq ($(USECCACHE)$(shell echo $(FC) | grep ccache),1) -### override FC:=ccache $(FC) -###endif - -# Load additional dependencies of the bias module, if present -ifeq (,$(wildcard ../bias_dependencies)) -BIASDEPENDENCIES = -else -include ../bias_dependencies -endif - -# Definitions - -LIBDIR = ../../lib/ -BINDIR = ../../bin/ -PROG = madevent - -ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") - include ../MadLoop_makefile_definitions -else - LINK_LOOP_LIBS = - LOOP_LIBS = - LOOP_INCLUDE = - LINK_MADLOOP_LIB = - MADLOOP_LIB = -endif - -LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias - -CUDACPP_MAKEFILE=cudacpp.mk -processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') -ifeq ($(BACKEND),cuda) -CUDACPP_COMMONLIB=mg5amc_common_cuda -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda -else ifeq ($(BACKEND),hip) -CUDACPP_COMMONLIB=mg5amc_common_hip -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip -else -CUDACPP_COMMONLIB=mg5amc_common_cpp -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp -endif - -LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) - -ifneq ("$(wildcard ../../Source/RUNNING)","") - LINKLIBS += -lrunning - LIBS += $(LIBDIR)librunning.$(libext) -endif - - -# Source files - -MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) -MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) -ifeq ($(strip $(MATRIX_HEL)),) - MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) -endif - - -PROCESS= myamp.o genps.o unwgt.o setcuts.o get_color.o \ - cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ - idenparts.o dummy_fct.o - -DSIG=driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) -DSIG_cudacpp=driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) - -SYMMETRY = symmetry.o idenparts.o - -# Binaries - -ifeq ($(UNAME),Darwin) -LDFLAGS += -lc++ # avoid 'Undefined symbols' for chrono::steady_clock on macOS (checked with otool -L libmg5amc_gg_ttx_cpp.so) -LDFLAGS += -mmacosx-version-min=11.3 # avoid "ld: warning: object file was built for newer macOS version than being linked" -else -LDFLAGS += -Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 (not supported on macOS) -endif - -# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) -.DEFAULT_GOAL := all - -ifeq ($(BACKEND),cuda) -all: $(PROG)_fortran 
$(CUDACPP_BUILDDIR)/$(PROG)_cuda -else ifeq ($(BACKEND),hip) -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip -else -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp -endif - -# Disable OpenMP by default: enable OpenMP only if USEOPENMP=1 (#758) -ifeq ($(USEOPENMP),1) -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -override OMPFLAGS = -fopenmp -LINKLIBS += -liomp5 # see #578 -LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -override OMPFLAGS = -fopenmp -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 -else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang -else -override OMPFLAGS = -fopenmp -endif -endif - -$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o - $(FC) -o $(PROG)_fortran $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) - -$(LIBS): .libs - -.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat - cd ../../Source; make - touch $@ - -$(CUDACPP_BUILDDIR)/.cudacpplibs: - $(MAKE) -f $(CUDACPP_MAKEFILE) - touch $@ - -# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH -# Use relative paths with respect to the executables ($ORIGIN on Linux) -# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary -ifeq ($(UNAME_S),Darwin) - override LIBFLAGSRPATH = -else ifeq ($(USEBUILDDIR),1) - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' -else - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' -endif - -.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link - -madevent_fortran_link: $(PROG)_fortran - rm -f $(PROG) - ln -s $(PROG)_fortran $(PROG) - -madevent_cuda_link: - $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) - -madevent_hip_link: - $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) - -madevent_cpp_link: - $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -override SUPPORTED_AVXS = cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto -madevent_%_link: - @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then echo "ERROR! 
Invalid target '$@' (supported madevent_cpp*_link targets are: $(foreach avx,$(SUPPORTED_AVXS),'madevent_cpp$(avx)_link'))"; exit 1; fi - $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_cuda now uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_hip also uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -counters.o: counters.cc timer.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -ompnumthreads.o: ompnumthreads.cc ompnumthreads.h - $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ - -$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) - $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) - -gensym: $(SYMMETRY) configs.inc $(LIBS) - $(FC) -o gensym $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) - -###ifeq (,$(wildcard fbridge.inc)) # Pointless: fbridge.inc always exists as this is the cudacpp-modified makefile! -###$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat -### cd ../../Source/MODEL; make -### -###$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat -### cd ../../Source; make -### -###$(LIBDIR)libpdf.$(libext): -### cd ../../Source/PDF; make -### -###$(LIBDIR)libgammaUPC.$(libext): -### cd ../../Source/PDF/gammaUPC; make -###endif - -# Add source so that the compiler finds the DiscreteSampler module. 
-$(MATRIX): %.o: %.f - $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%.o: %.f - $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%_cudacpp.o: %.f - $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ - -# Dependencies - -driver.f: genps.inc -symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc -genps.o: genps.inc nexternal.inc configs.inc -dummy_fct.0: run.inc genps.inc -cuts.o: genps.inc nexternal.inc pmass.inc -setcuts.o: genps.inc run_config.inc -invarients.o: genps.inc nexternal.inc -myamp.o: props.inc genps.inc nexternal.inc -reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ - run_config.inc -cluster.o: cluster.inc genps.inc nexternal.inc message.inc -addmothers.o: genps.inc nexternal.inc symswap.inc message.inc -unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ - run_config.inc -initcluster.o: message.inc - -# Extra dependencies on discretesampler.mod - -auto_dsig.o: .libs -driver.o: .libs -driver_cudacpp.o: .libs -$(MATRIX): .libs -genps.o: .libs - -# Cudacpp bldall targets - -ifeq ($(UNAME_P),ppc64le) -bldavxs: bldnone bldsse4 -else ifeq ($(UNAME_P),arm) -bldavxs: bldnone bldsse4 -else -bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z -endif - -ifneq ($(shell which hipcc 2>/dev/null),) -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldhip bldcuda bldavxs -else -bldall: bldhip bldavxs -endif -else -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldcuda bldavxs -else -bldall: bldavxs -endif -endif - -bldcuda: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cuda - -bldhip: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=hip - -bldnone: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppnone - -bldsse4: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 - -bldavx2: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 - -bld512y: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y - -bld512z: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z - -# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) - -clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn - $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(CUDACPP_BUILDDIR)/$(PROG)_hip - -cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src - $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall - rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs - rm -f .libs - -cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src - make -C ../../Source cleanall - rm -rf $(LIBDIR)libbias.$(libext) - rm -f ../../Source/*.mod ../../Source/*/*.mod - -distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation - $(MAKE) -f $(CUDACPP_MAKEFILE) distclean diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile new file mode 120000 index 0000000000..9fba275947 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile @@ -0,0 +1 @@ +makefile_wrapper.mk \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile_original.mk 
b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile_original.mk new file mode 100644 index 0000000000..348c283be7 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile_original.mk @@ -0,0 +1,101 @@ +include ../../Source/make_opts +FFLAGS+= -w + +# Load additional dependencies of the bias module, if present +ifeq (,$(wildcard ../bias_dependencies)) +BIASDEPENDENCIES = +else +include ../bias_dependencies +endif + +# Definitions + +LIBDIR = ../../lib/ +BINDIR = ../../bin/ +PROG = madevent + +ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") + include ../MadLoop_makefile_definitions +else + LINK_LOOP_LIBS = + LOOP_LIBS = + LOOP_INCLUDE = + LINK_MADLOOP_LIB = + MADLOOP_LIB = +endif + +LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L../../lib/ -ldhelas -ldsample -lmodel -lgeneric -lpdf -lgammaUPC -lcernlib $(llhapdf) -lbias + +LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) + +ifneq ("$(wildcard ../../Source/RUNNING)","") + LINKLIBS += -lrunning + LIBS += $(LIBDIR)librunning.$(libext) +endif + + +# Source files + +MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) +MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) +ifeq ($(strip $(MATRIX_HEL)),) + MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) +endif + + +PROCESS= driver.o myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o \ + $(patsubst %.f,%.o,$(wildcard auto_dsig*.f)) \ + +SYMMETRY = symmetry.o idenparts.o + +# Binaries + +$(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) + $(FC) -o $(PROG) $(PROCESS) $(MATRIX) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) + +$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat + cd ../../Source/MODEL; make + +$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat + cd ../../Source; make + +$(LIBDIR)libpdf.$(libext): + cd ../../Source/PDF; make + +$(LIBDIR)libgammaUPC.$(libext): + cd ../../Source/PDF/gammaUPC; make + +# Add source so that the compiler finds the DiscreteSampler module. 
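+# (Note: in the stock MG5aMC layout the DiscreteSampler module is built under ../../Source,
+# which is why the compile rules below pass -I../../Source/ so that the Fortran compiler can
+# resolve the corresponding .mod file; the exact module location is assumed here rather than
+# spelled out in this patch.)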
+$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +# Dependencies + +driver.f: genps.inc +symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc +genps.o: genps.inc nexternal.inc configs.inc +dummy_fct.0: run.inc genps.inc +cuts.o: genps.inc nexternal.inc pmass.inc +setcuts.o: genps.inc run_config.inc +invarients.o: genps.inc nexternal.inc +myamp.o: props.inc genps.inc nexternal.inc +reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ + run_config.inc +cluster.o: cluster.inc genps.inc nexternal.inc message.inc +addmothers.o: genps.inc nexternal.inc symswap.inc message.inc +unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ + run_config.inc +initcluster.o: message.inc + +clean: + $(RM) *.o gensym madevent madevent_forhel diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/myamp.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min with dsqrt_shatmax**2 with the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/reweight.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/reweight.f index 0a0bafa7c1..9d8fe1c4f0 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/reweight.f @@ -976,9 +976,9 @@ logical function setclscales(p, keepq2bck, ivec) $ ' and jcentral is ',jcentral(1),jcentral(2) if (btest(mlevel,3)) then - write(*,'(a$)') 'QCD jets (final): ' + write(*,'(a,$)') 'QCD jets (final): ' do i=3,nexternal - if(iqjets(i).gt.0) write(*,'(i3$)') i + if(iqjets(i).gt.0) write(*,'(i3,$)') i enddo write(*,*) endif @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) @@ -1387,7 +1387,9 @@ double precision function rewgt(p, ivec) integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - include 'configs.inc' + integer fake_id + common/to_sprop/sprop,tprid,fake_id +c include 'configs.inc' real*8 xptj,xptb,xpta,xptl,xmtc real*8 xetamin,xqcut,deltaeta common /to_specxpt/xptj,xptb,xpta,xptl,xmtc,xetamin,xqcut,deltaeta @@ -1588,6 +1590,8 @@ double precision function rewgt(p, ivec) $ ipdgcl(1,igraphs(1),iproc),ipart,.false.).and. $ (goodjet(idacl(n,1)).or.goodjet(idacl(n,2)))) then c alpha_s weight + + if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1600,6 +1604,7 @@ double precision function rewgt(p, ivec) write(*,*)' as: ',alphas(alpsfact*dsqrt(q2now)), & '/',asref,' -> ',alphas(alpsfact*dsqrt(q2now))/asref write(*,*)' and G=',SQRT(4d0*PI*ALPHAS(scale)) + endif endif endif endif diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/runTest.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/symmetry.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/symmetry.f index 309540a0a2..d0706e90b4 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/symmetry.f @@ -51,6 +51,7 @@ program symmetry integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) + integer fake_id include 'configs.inc' data use_config/0,lmaxconfigs*0/ @@ -232,7 +233,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +243,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +261,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/unwgt.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/banner.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/banner.py index 42d82818d0..2bc6174b85 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/banner.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3306,7 +3353,7 @@ def edit_dummy_fct_from_file(self, filelist, outdir): def retro_compatible_custom_fct(lines, mode=None): f77_type = ['real*8', 'integer', 'double precision', 'logical'] - function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ + function_pat = re.compile(r'^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ % {'type':'|'.join(f77_type)}, re.I+re.M) include_pat = re.compile(r"\s+include\s+[\'\"]([\w\./]*)") @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - 
logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/check_param_card.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
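            # bench and the per-key results may have been stored as strings (e.g. when read
            # back from a saved scan summary), so the float() coercions above keep the
            # formatted write below from failing on non-numeric types.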
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/common_run_interface.py index 9ff7390cf5..8de498fcc2 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. 
Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. (beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. 
Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" @@ -6868,8 +6890,9 @@ def handle_alarm(signum, frame): else: log_level=20 - - if run_card: + if run_card and (run_card['lpp1'] !=0 or run_card['lpp2'] !=0): + # They are likely case like lpp=+-3, where alpas not need reset + # but those have dedicated name of pdf avoid the reset as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, 'cteq6_l': 0.118, diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/file_writers.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/files.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/files.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: 
raise if log: - logger.warning(why) + logger.warning("fail to cp", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
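# Standalone sketch of the cp() behaviour above: copying a file onto itself is
# treated as a no-op rather than an error. The temporary file is created here
# only for the example.
import os
import shutil
import tempfile

def cp_quiet(src, dst):
    try:
        shutil.copy(src, dst)
    except shutil.SameFileError:
        # raised when src and dst resolve to the same file: nothing to do
        return

fd, path = tempfile.mkstemp()
os.close(fd)
cp_quiet(path, path)  # silently does nothing
os.remove(path)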
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "

To save bandwidth not all diagrams were converted to PNG."; print PAGE "

To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 
'nprocs' in opts: + self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/histograms.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/histograms.py index 51ae2914fc..0883cd9613 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the 
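# Numeric sketch of the channel-splitting arithmetic introduced in
# get_job_for_event() above: ceiling division of the requested events by
# max_request_event, capped by max_splitting. The numbers are hypothetical
# (2500 matches the new gridpack maxevts default).
def n_splits(needed_event, max_request_event, max_splitting):
    nb_split = int(max(1, ((needed_event - 1) // max_request_event) + 1))
    return max(1, min(nb_split, max_splitting))

print(n_splits(9000, 2500, 100))  # 4 -> four jobs of ~2250 events each
print(n_splits(1800, 2500, 100))  # 1 -> no splitting needed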
variation. + if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2405,10 +2404,10 @@ def output(self, path, format='gnuplot',number_of_ratios = -1, gnuplot_output_list=gnuplot_output_list_v5 else: output, _ = p.communicate() - output.decode(errors='ignore') + output = output.decode(errors='ignore') if not output: gnuplot_output_list=gnuplot_output_list_v5 - elif float(output.split()[1]) < 5. : + elif int(output.split()[1].split('.')[0]) < 5 : gnuplot_output_list=gnuplot_output_list_v4 else: gnuplot_output_list=gnuplot_output_list_v5 @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. 
Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. 
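# Sketch of the gnuplot version test above: compare the integer major version
# instead of parsing the whole token as a float, which would fail for a token
# such as '5.4.8'. The sample string is illustrative.
version_output = 'gnuplot 5.4 patchlevel 8'
major = int(version_output.split()[1].split('.')[0])
use_v4_templates = major < 5
print(major, use_v4_templates)  # 5 False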
import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/lhe_parser.py index f6e47956cd..d4b94bab10 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -1792,7 +1794,10 @@ def add_decays(self, pdg_to_decay): if particle.pdg in pdg_to_decay and pdg_to_decay[particle.pdg]: one_decay = pdg_to_decay[particle.pdg].pop() self.add_decay_to_particle(i, one_decay) + particle.helicity = 9 return 
self.add_decays(pdg_to_decay) + + return self @@ -2166,10 +2171,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2961,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..dea35930ea 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz 
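# Quick check of the corrected pseudorapidity above,
# eta = 0.5 * ln((|p| + pz) / (|p| - pz)):
# a particle moving mostly along +z must get a large positive eta.
import math

def pseudorapidity(px, py, pz):
    norm = math.sqrt(px**2 + py**2 + pz**2)
    return 0.5 * math.log((norm + pz) / (norm - pz))

print(pseudorapidity(1.0, 0.0, 10.0))  # ~ +3.0, as expected for pz > 0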
%(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3667,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_comine_iteration(self, line): + def do_combine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
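# Condensed sketch of the shower-driver selection above: by default use
# Pythia8's bundled main164 example, falling back to the legacy
# MG5aMC_PY8_interface when '--old_interface' is requested. The path layout
# mirrors the one probed above; both base paths are placeholders.
import os

def pick_pythia_main(args, pythia8_path, mg5amc_py8_interface_path):
    if '--old_interface' in args:
        return os.path.join(mg5amc_py8_interface_path, 'MG5aMC_PY8_interface')
    main164 = os.path.join(pythia8_path, 'share', 'Pythia8', 'examples', 'main164')
    if not os.path.exists(main164):
        main164 = os.path.join(pythia8_path, 'examples', 'main164')
    return main164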
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -5596,6 +5635,19 @@ def do_plot(self, line): else: logger.info('No valid files for delphes plot') + def do_compile(self, line): + """compile the current directory """ + + args = self.split_arg(line) + self.ask_run_configuration(mode='parton') + self.run_card = banner_mod.RunCard(pjoin(self.me_dir, 'Cards', 'run_card.dat')) + self.configure_directory(html_opening =False) + + for Pdir in self.get_Pdir(): + misc.sprint(Pdir) + self.compile(['gensym'], cwd=Pdir) + self.compile(['madevent_forhel'], cwd=Pdir) + ############################################################################ def do_syscalc(self, line): """Evaluate systematics variation weights for a given run""" @@ -6132,7 +6184,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in 
Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file.' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6896,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6906,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7023,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... 
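# Minimal sketch of the filtering idea behind remove_empty_events() above:
# drop channel directories whose events.lhe is missing or essentially empty
# (the 10-byte threshold mirrors the one used above). Directory names passed
# in are hypothetical.
import os

def keep_nonempty(gdirs):
    kept = []
    for gdir in gdirs:
        try:
            size = os.path.getsize(os.path.join(gdir, 'events.lhe'))
        except OSError:
            size = 0
        if size >= 10:
            kept.append(gdir)
    return kept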
self.exec_cmd('combine_events') @@ -6902,6 +7051,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7066,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7078,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7203,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7223,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/restore_data b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/restore_data +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . 
-mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/sum_html.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s

' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/madevent b/epochX/cudacpp/susy_gg_tt.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/madevent +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/susy_gg_tt.mad/src/HelAmps_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.mad/src/HelAmps_MSSM_SLHA2.h index 9ed58e24f1..f5c68fb7c4 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/src/HelAmps_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.mad/src/HelAmps_MSSM_SLHA2.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/susy_gg_tt.mad/src/Parameters_MSSM_SLHA2.cc b/epochX/cudacpp/susy_gg_tt.mad/src/Parameters_MSSM_SLHA2.cc index aa00d6a9e4..0fd9310ffa 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/src/Parameters_MSSM_SLHA2.cc +++ b/epochX/cudacpp/susy_gg_tt.mad/src/Parameters_MSSM_SLHA2.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/susy_gg_tt.mad/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.mad/src/Parameters_MSSM_SLHA2.h index 3e29f2ccbe..5a7f431dc1 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.mad/src/Parameters_MSSM_SLHA2.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
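# Small sketch of the get_attr convention added to make_all_html_results()
# above: None keeps the historical (xsec, xerru) pair, a tuple returns one
# value per attribute name, and a string returns a single attribute. The
# Results class is a stand-in with made-up numbers.
class Results:
    xsec, xerru, axsec = 1.23, 0.04, 1.30

def summary(results, get_attr=None):
    if not get_attr:
        return results.xsec, results.xerru
    if isinstance(get_attr, tuple):
        return [getattr(results, name) for name in get_attr]
    return getattr(results, get_attr)

print(summary(Results()))                                       # (1.23, 0.04)
print(summary(Results(), get_attr=('xsec', 'xerru', 'axsec')))  # [1.23, 0.04, 1.3]
print(summary(Results(), get_attr='axsec'))                     # 1.3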
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuConfig.h index 7c6a082392..be5c5a6357 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/susy_gg_tt.mad/test/cudacpp_test.mk b/epochX/cudacpp/susy_gg_tt.mad/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/susy_gg_tt.mad/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt index 420090461f..daecfb0066 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -550,45 +549,45 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.118 s +1 processes with 3 diagrams generated in 0.081 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_tt Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/. 
+DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/. Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.136 s +ALOHA: aloha creates 2 routines in 0.108 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. quit -real 0m1.291s -user 0m1.202s -sys 0m0.072s -Code generation completed in 2 seconds +real 0m1.145s +user 0m1.053s +sys 0m0.085s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/susy_gg_tt.sa/COPYRIGHT b/epochX/cudacpp/susy_gg_tt.sa/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/COPYRIGHT +++ b/epochX/cudacpp/susy_gg_tt.sa/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). 
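To make the two layouts described in the comment above concrete, the standalone snippet below (illustrative sizes only, not code from this patch) computes where one momentum component lives in the Fortran-style AOS array p_multi[nevt][npar][np4] and in the cudacpp AOSOA array momenta[npagM][npar][np4][neppM], with nevt = npagM*neppM.

#include <cassert>
#include <cstddef>
int main()
{
  const std::size_t npar = 4, np4 = 4, neppM = 8, nevt = 32; // illustrative sizes only
  assert( nevt % neppM == 0 );
  const std::size_t ievt = 13, ipar = 2, ip4 = 1; // one arbitrary momentum component
  // Fortran AOS: C reads P_MULTI(0:3,NEXTERNAL,VECSIZE_USED) as p_multi[nevt][npar][np4]
  const std::size_t fpos = ievt * npar * np4 + ipar * np4 + ip4;
  // cudacpp AOSOA: momenta[npagM][npar][np4][neppM]
  const std::size_t ipagM = ievt / neppM, ieppM = ievt % neppM;
  const std::size_t cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM;
  assert( fpos == 217 ); // 13*16 + 2*4 + 1
  assert( cpos == 205 ); // 1*128 + 2*32 + 1*8 + 5
  return 0;
}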
+ */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
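The grid-sizing logic above (halve gputhreads from the default of 256 until gpublocks*gputhreads equals nevt, never going below the minimum of 32) can be exercised in isolation; the following is a sketch with an arbitrary nevt, not the plugin code itself.

#include <cstdio>
#include <stdexcept>
#include <string>
int main()
{
  const int nevt = 8320;          // arbitrary example, a multiple of s_gputhreadsmin
  const int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT)
  if( ( nevt < s_gputhreadsmin ) || ( nevt % s_gputhreadsmin != 0 ) )
    throw std::runtime_error( "nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) );
  int gputhreads = 256;              // default number of gpu threads
  int gpublocks = nevt / gputhreads; // initial guess for the number of gpu blocks
  while( nevt != gpublocks * gputhreads )
  {
    gputhreads /= 2;
    if( gputhreads < s_gputhreadsmin )
      throw std::logic_error( "cannot choose gputhreads" ); // should never happen
    gpublocks = nevt / gputhreads;
  }
  std::printf( "nevt=%d gpublocks=%d gputhreads=%d\n", nevt, gpublocks, gputhreads ); // 8320, 65, 128
  return 0;
}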
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
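As the comments above note, the F2C step is really an AOS-to-AOSOA conversion rather than a plain transposition, and for neppM=1 a memcpy suffices. A standalone host-side sketch of that conversion, with a hypothetical helper name aosToAosoa and arbitrary sizes, could look as follows.

#include <cassert>
#include <cstddef>
#include <cstring>
#include <vector>
template<typename T>
void aosToAosoa( T* out, const T* in, std::size_t nevt, std::size_t npar, std::size_t np4, std::size_t neppM )
{
  assert( nevt % neppM == 0 ); // number of events must be a multiple of neppM
  if( neppM == 1 )
  {
    std::memcpy( out, in, nevt * npar * np4 * sizeof( T ) ); // AOSOA with neppM=1 is identical to AOS
    return;
  }
  const std::size_t npagM = nevt / neppM;
  for( std::size_t ipagM = 0; ipagM < npagM; ipagM++ )
    for( std::size_t ip4 = 0; ip4 < np4; ip4++ )
      for( std::size_t ipar = 0; ipar < npar; ipar++ )
        for( std::size_t ieppM = 0; ieppM < neppM; ieppM++ )
        {
          const std::size_t ievt = ipagM * neppM + ieppM;
          const std::size_t cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM;
          const std::size_t fpos = ievt * npar * np4 + ipar * np4 + ip4;
          out[cpos] = in[fpos]; // F2C (Fortran AOS to C++ AOSOA)
        }
}
int main()
{
  const std::size_t nevt = 16, npar = 4, np4 = 4, neppM = 4; // illustrative sizes only
  std::vector<double> in( nevt * npar * np4 ), out( in.size() );
  for( std::size_t i = 0; i < in.size(); i++ ) in[i] = double( i );
  aosToAosoa( out.data(), in.data(), nevt, npar, np4, neppM );
  return 0;
}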
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
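The gpuBlasT* aliases introduced in GpuAbstraction.h above select the single- or double-precision BLAS entry points according to MGONGPU_FPTYPE2_FLOAT, so the color-sum code can call one name regardless of precision. The stand-in below only illustrates that dispatch pattern; myBlasSgemm and myBlasDgemm are hypothetical placeholders, not cuBLAS/hipBLAS calls.

#include <cstdio>
// Hypothetical stand-ins for the precision-specific BLAS entry points
void myBlasSgemm() { std::puts( "single-precision gemm (fptype2 = float)" ); }
void myBlasDgemm() { std::puts( "double-precision gemm (fptype2 = double)" ); }
#ifdef MGONGPU_FPTYPE2_FLOAT
typedef float fptype2;
#define myBlasTgemm myBlasSgemm
#else
typedef double fptype2;
#define myBlasTgemm myBlasDgemm
#endif
int main()
{
  std::printf( "sizeof(fptype2) = %zu\n", sizeof( fptype2 ) );
  myBlasTgemm(); // resolved at preprocessing time, like gpuBlasTgemm above
  return 0;
}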
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
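The CUDACPP_RUNTIME_BLASCOLORSUM handling added to MatrixElementKernelDevice above enables BLAS color sums only when the variable is set and non-empty (and throws if BLAS was disabled at build time). A minimal standalone sketch of that getenv-based toggle, without the cuBLAS/hipBLAS parts, is shown here.

#include <cstdlib>
#include <iostream>
#include <string>
int main()
{
  bool blasColorSum = false; // same default as m_blasColorSum
  const char* blasEnv = std::getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" );
  if( blasEnv && std::string( blasEnv ) != "" )
  {
    blasColorSum = true;
    std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl;
  }
  else
  {
    std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl;
  }
  return blasColorSum ? 0 : 1;
}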
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h index 5bd3053393..c5e79dc1b1 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ 
-194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + 
typedef DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc index 6867c6d67d..81057d8134 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_MSSM_SLHA2.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_MSSM_SLHA2_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE 
void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = 
DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
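Note on the access pattern used throughout these kernels: DeviceAccessJamp2::kernelAccessIcol above, like the other *_ACCESS helpers, implements a structure-of-arrays layout with one GPU thread per event, where element (icol, ievt) lives at buffer[icol * nevt + ievt] so that consecutive threads in a warp touch consecutive addresses (coalesced access). A minimal illustrative kernel using the same convention (hypothetical kernel and buffer names, not part of this patch):

  // Illustrative sketch only: same icol-major / ievt-fastest layout as DeviceAccessJamp2.
  __global__ void scaleAllColors( double* buffer, const double factor, const int ncolor )
  {
    const int nevt = gridDim.x * blockDim.x;                // total number of events in the grid
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per thread
    for( int icol = 0; icol < ncolor; icol++ )
      buffer[icol * nevt + ievt] *= factor;                 // stride of nevt between colors
  }
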
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -365,154 +421,43 @@ namespace mg5amcCpu jamp_sv[1] -= amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttx()?) 
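The atomicAdd just above is required because, with one CUDA/HIP stream per good helicity, several calculate_jamps launches can be in flight at the same time and all accumulate |jamp|^2 into the same colAllJamp2s slot for a given (icol, ievt); a plain += would be a data race. A minimal sketch of the idea, with a hypothetical kernel and a simplified split Re/Im layout (double-precision atomicAdd needs compute capability 6.0 or newer):

  // Illustrative sketch only: kernels launched in different helicity streams
  // accumulate into one shared per-color, per-event running sum without racing.
  __global__ void accumulateJamp2( double* colJamp2s,    // [ncolor * nevt], shared by all helicity streams
                                   const double* jampRe, // [ncolor * nevt], this helicity only
                                   const double* jampIm, // [ncolor * nevt], this helicity only
                                   const int ncolor )
  {
    const int nevt = gridDim.x * blockDim.x;
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
    for( int icol = 0; icol < ncolor; icol++ )
    {
      const int idx = icol * nevt + ievt;
      const double abs2 = jampRe[idx] * jampRe[idx] + jampIm[idx] * jampIm[idx];
      atomicAdd( &colJamp2s[idx], abs2 ); // concurrent kernels may update the same slot
    }
  }
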
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2 }, - { -2, 16 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
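To make the removed comments above concrete: for this process ncolor=2, cf={{16,-2},{-2,16}} and denom={3,3}, so the color sum reduces to |M|^2 += ( 16*|jamp0|^2 + 16*|jamp1|^2 - 4*Re(jamp0*conj(jamp1)) ) / 3; because cf is real, the quadratic form over complex jamps splits into one quadratic form over the real parts plus one over the imaginary parts (the "AMA + BMB" identity quoted above). A standalone arithmetic check of this, using std::complex instead of the plugin's cxtype_sv (illustrative only, not plugin code):

  #include <complex>
  #include <cstdio>

  int main() // standalone check of the ncolor=2 color sum
  {
    constexpr int ncolor = 2;
    constexpr double denom[ncolor] = { 3, 3 };
    constexpr double cf[ncolor][ncolor] = { { 16, -2 }, { -2, 16 } };
    const std::complex<double> jamp[ncolor] = { { 1.5, -0.5 }, { -0.25, 2.0 } };
    // Full quadratic form, as in the removed CUDA loop: sum_i ( sum_j cf[i][j]*jamp_j ) . jamp_i / denom[i]
    double me = 0;
    for( int i = 0; i < ncolor; i++ )
    {
      double ztempR = 0, ztempI = 0;
      for( int j = 0; j < ncolor; j++ )
      {
        ztempR += cf[i][j] * jamp[j].real();
        ztempI += cf[i][j] * jamp[j].imag();
      }
      me += ( ztempR * jamp[i].real() + ztempI * jamp[i].imag() ) / denom[i];
    }
    // Closed form for this 2x2 case
    const double me2 = ( 16 * std::norm( jamp[0] ) + 16 * std::norm( jamp[1] ) - 4 * ( jamp[0] * std::conj( jamp[1] ) ).real() ) / 3;
    printf( "quadratic form: %f closed form: %f\n", me, me2 ); // both print 36.833333
    return 0;
  }
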
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -552,7 +497,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -585,6 +534,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -625,6 +578,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_MSSM_SLHA2::ZERO ); m_masses.push_back( Parameters_MSSM_SLHA2::mdl_MT ); m_masses.push_back( Parameters_MSSM_SLHA2::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -745,8 +702,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -754,25 +711,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event 
random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -917,13 +1052,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -935,18 +1064,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -971,93 +1105,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1099,7 +1170,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1122,7 +1193,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1131,21 +1202,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1159,8 +1232,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1176,11 +1251,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1282,14 +1358,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h index 24c27005b8..f74d539775 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_MSSM_SLHA2.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 3; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, 
running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/color_sum.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/color_sum.cc new file mode 100644 index 0000000000..b68b9250fd --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/color_sum.cc @@ -0,0 +1,427 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
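For orientation, the temporary "ghel" super-buffers passed to the new GPU sigmaKin above are all sized per good helicity and per event; a minimal sketch of the element counts implied by the signature comments is given below (illustrative only, not part of this patch; the helper names and the example numbers are hypothetical).

#include <cstddef>

// Element counts implied by the buffer comments in the sigmaKin signature above:
// ghelAllJamps is an allJamps super-buffer[2][ncolor][nGoodHel][nevt] of fptype values
// (factor 2 for real and imaginary parts), while ghelAllMEs holds one running |M|^2
// sum per good helicity and per event.
constexpr std::size_t ghelAllJampsElements( std::size_t ncolor, std::size_t nGoodHel, std::size_t nevt )
{
  return 2 * ncolor * nGoodHel * nevt;
}

constexpr std::size_t ghelAllMEsElements( std::size_t nGoodHel, std::size_t nevt )
{
  return nGoodHel * nevt;
}

// Example: this susy gg->ttx process has ncolor=2 and ncomb=16, so with (at most) 16 good
// helicities and e.g. 524288 events the jamp super-buffer holds 2*2*16*524288 values.
static_assert( ghelAllJampsElements( 2, 16, 524288 ) == 2ull * 2 * 16 * 524288, "unexpected size" );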
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2 }, + { -2, 16 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
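The rewrite of the quadratic form described in the comment above can be checked in isolation. The standalone snippet below is an illustrative sketch only (not part of this patch): it uses the ncolor=2 color matrix and denominators hardcoded in this file, plus arbitrary test amplitudes, to verify that the triangular real form with doubled off-diagonal terms reproduces the full complex form jamp^dagger (M/denom) jamp.

#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  constexpr int ncolor = 2;
  const double colorDenom[ncolor] = { 3, 3 };
  const double colorMatrix[ncolor][ncolor] = { { 16, -2 }, { -2, 16 } };
  const std::complex<double> jamp[ncolor] = { { 1.2, -0.7 }, { -0.3, 2.1 } }; // arbitrary test values
  // Full complex quadratic form: sum_ij conj(jamp_i) * M_ij / d_i * jamp_j
  double me1 = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    std::complex<double> ztemp = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ ) ztemp += colorMatrix[icol][jcol] * jamp[jcol];
    me1 += ( std::conj( jamp[icol] ) * ztemp ).real() / colorDenom[icol];
  }
  // Triangular real form (as in the loop below): AMA + BMB with off-diagonal terms counted twice
  double me2 = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    double ztempR = colorMatrix[icol][icol] / colorDenom[icol] * jamp[icol].real();
    double ztempI = colorMatrix[icol][icol] / colorDenom[icol] * jamp[icol].imag();
    for( int jcol = icol + 1; jcol < ncolor; jcol++ )
    {
      ztempR += 2 * colorMatrix[icol][jcol] / colorDenom[icol] * jamp[jcol].real();
      ztempI += 2 * colorMatrix[icol][jcol] / colorDenom[icol] * jamp[jcol].imag();
    }
    me2 += jamp[icol].real() * ztempR + jamp[icol].imag() * ztempI;
  }
  assert( std::abs( me1 - me2 ) < 1e-9 );
  return 0;
}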
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum for all good helicities to BLAS + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/color_sum.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/cudacpp_overlay.mk b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ 
b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fbridge.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/makefile_original.mk b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/color_sum.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return 
cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
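Going back to the BLAS color sum above: what the gpuBlasTgemm and gpuBlasTgemmStridedBatched calls in color_sum_blas compute, for the real and imaginary parts separately, can be written as a plain C++ reference using the [2][ncolor][nhel][nevt] jamp striding described in DeviceAccessJamp. This is an illustrative sketch only (not part of this patch); colorSumReference and its arguments are hypothetical names, and the normalized color matrix is assumed to be symmetric and already divided by the denominators.

#include <vector>

void colorSumReference( std::vector<double>& ghelAllMEs,          // [nhel*nevt], |M|^2 accumulated in place
                        const std::vector<double>& ghelAllJamps,  // [2][ncolor][nhel][nevt], real block then imaginary block
                        const std::vector<double>& normColorMat,  // [ncolor][ncolor], symmetric, already divided by denom
                        int ncolor, int nhel, int nevt )
{
  const int n = nhel * nevt; // one column per (helicity, event) pair
  const double* jampR = ghelAllJamps.data();              // real parts
  const double* jampI = ghelAllJamps.data() + ncolor * n; // imaginary parts
  std::vector<double> ztempR( ncolor * n ), ztempI( ncolor * n );
  // Step 1 (the two gemm calls): Ztemp[icol][ix] = sum_jcol NormColorMat[icol][jcol] * Jamp[jcol][ix]
  for( int icol = 0; icol < ncolor; icol++ )
    for( int ix = 0; ix < n; ix++ )
    {
      double sumR = 0, sumI = 0;
      for( int jcol = 0; jcol < ncolor; jcol++ )
      {
        sumR += normColorMat[icol * ncolor + jcol] * jampR[jcol * n + ix];
        sumI += normColorMat[icol * ncolor + jcol] * jampI[jcol * n + ix];
      }
      ztempR[icol * n + ix] = sumR;
      ztempI[icol * n + ix] = sumI;
    }
  // Step 2 (the two strided-batched gemm calls): per-(helicity,event) dot products,
  // added on top of the existing per-helicity |M|^2 running sums (beta=1)
  for( int ix = 0; ix < n; ix++ )
  {
    double me = 0;
    for( int icol = 0; icol < ncolor; icol++ )
      me += jampR[icol * n + ix] * ztempR[icol * n + ix] + jampI[icol * n + ix] * ztempI[icol * n + ix];
    ghelAllMEs[ix] += me;
  }
}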
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. 
+ +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/runTest.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/HelAmps_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/HelAmps_MSSM_SLHA2.h index 9ed58e24f1..f5c68fb7c4 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/HelAmps_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/HelAmps_MSSM_SLHA2.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc index aa00d6a9e4..0fd9310ffa 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h index 3e29f2ccbe..5a7f431dc1 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h index d3c4ca5695..7d34de72f8 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for computing color sums +// For both CUDA and HIP, by default, assume that cuBLAS/hipBLAS are available, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk b/epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/tmad/allTees.sh b/epochX/cudacpp/tmad/allTees.sh index eb39e2b302..17367f7f6b 100755 --- a/epochX/cudacpp/tmad/allTees.sh +++ b/epochX/cudacpp/tmad/allTees.sh @@ -1,23 +1,41 @@ #!/bin/bash -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: A. Valassi (May 2022) for the MG5aMC CUDACPP plugin. -# Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. scrdir=$(cd $(dirname $0); pwd) host=$(hostname) if [ "${host/juwels}" != "${host}" ]; then ${scrdir}/juwelspatch.sh; fi # workaround for #498 +# Usage +function usage() +{ + echo "Usage (1): $0 [-short|-ggttggg] [-bsmonly|-nobsm] [-makeclean] [+10x] [-hip]" + echo "Run tests and check all logs" + echo "" + echo "Usage (2): $0 -checkonly" + echo "Check existing logs without running any tests" + exit 1 +} + +# Parse command line arguments +checkonly=0 short=0 bsm= flts=-dmf # "d m f" (alternative: -d_f i.e. "d f") makeclean= rmrdat= -add10x="+10x" +add10x= hip= - -while [ "$1" != "" ]; do +if [ "$1" == "-checkonly" ]; then + # Check existing logs without running any tests? 
+ checkonly=1 + shift + if [ "$1" != "" ]; then usage; fi +fi +while [ "${checkonly}" == "0" ] && [ "$1" != "" ]; do if [ "$1" == "-short" ]; then short=1 # all (possibly including bsm) but ggttggg shift @@ -27,8 +45,8 @@ while [ "$1" != "" ]; do elif [ "$1" == "-makeclean" ]; then makeclean=$1 shift - elif [ "$1" == "-no10x" ]; then - add10x="" + elif [ "$1" == "+10x" ]; then + add10x=$1 shift elif [ "$1" == "-bsmonly" ] && [ "$bsm" != "-nobsm" ]; then bsm=$1 @@ -40,43 +58,73 @@ while [ "$1" != "" ]; do hip=$1 shift else - echo "Usage: $0 [-short|-ggttggg] [-bsmonly|-nobsm] [-makeclean] [-no10x] [-hip]" - exit 1 + usage fi done -started="STARTED AT $(date)" - -if [ "${bsm}" != "-bsmonly" ]; then - if [ "$short" == "1" ]; then - ${scrdir}/teeMadX.sh -eemumu -ggtt -ggttg -ggttgg -gqttq $flts $makeclean $rmrdat $add10x $hip - elif [ "$short" == "-1" ]; then - ${scrdir}/teeMadX.sh -ggttggg $flts $makeclean $rmrdat $add10x $hip - else - ${scrdir}/teeMadX.sh -eemumu -ggtt -ggttg -ggttgg -gqttq -ggttggg $flts $makeclean $rmrdat $add10x $hip +# Run all tests +if [ "${checkonly}" == "0" ]; then + started="STARTED AT $(date)" + # SM tests + if [ "${bsm}" != "-bsmonly" ]; then + if [ "$short" == "1" ]; then + ${scrdir}/teeMadX.sh -eemumu -ggtt -ggttg -ggttgg -gqttq $flts $makeclean $rmrdat $add10x $hip + elif [ "$short" == "-1" ]; then + ${scrdir}/teeMadX.sh -ggttggg $flts $makeclean $rmrdat $add10x $hip + else + ${scrdir}/teeMadX.sh -eemumu -ggtt -ggttg -ggttgg -gqttq -ggttggg $flts $makeclean $rmrdat $add10x $hip + fi fi -fi -status=$? -ended1="(SM tests)\nENDED(1) AT $(date) [Status=$status]" - -if [ "${bsm}" != "-nobsm" ]; then - if [ "$short" != "-1" ]; then - ${scrdir}/teeMadX.sh -heftggbb -susyggtt -susyggt1t1 -smeftggtttt $flts $makeclean $rmrdat $add10x $hip + status=$? + ended1="(SM tests)\nENDED(1) AT $(date) [Status=$status]" + # BSM tests + if [ "${bsm}" != "-nobsm" ]; then + if [ "$short" != "-1" ]; then + ${scrdir}/teeMadX.sh -heftggbb -susyggtt -susyggt1t1 -smeftggtttt $flts $makeclean $rmrdat $add10x $hip + fi fi + status=$? + ended2="(BSM tests)\nENDED(1) AT $(date) [Status=$status]" + # Timing information + echo + printf "\n%80s\n" |tr " " "#" + echo + echo -e "$started" + echo -e "$ended1" + echo -e "$ended2" + echo fi -status=$? 
-ended2="(BSM tests)\nENDED(1) AT $(date) [Status=$status]" # Print out the number of "OK!"s in each log (expect 24) +for f in ${scrdir}/logs_*_mad/log_*; do echo $(cat $f | grep OK | wc -l) $f; done # expect 24 + +# Print out any errors or aborts in the logs echo -printf "\n%80s\n" |tr " " "#" +txt=$(egrep -i '(error|abort)' tmad/logs* -r | sed 's/:0:rocdevice.cpp.*Aborting.*/rocdevice.cpp: Aborting/') +if [ "${txt}" == "" ]; then + echo "No errors or aborts found in logs" +else + echo "${txt}" +fi + +# Print out any asserts in the logs echo -echo -e "$started" -echo -e "$ended1" -echo -e "$ended2" +txt=$(grep assert tmad/logs* -r | sed "s/Gpu.*Assert/Assert/") +if [ "${txt}" == "" ]; then + echo "No asserts found in logs" +else + echo "${txt}" +fi + +# Print out any segfaults in the logs echo -for f in ${scrdir}/logs_*_mad/log_*; do echo $(cat $f | grep OK | wc -l) $f; done # expect 24 - +txt=$(grep -i segmentation tmad/logs* -r | sed "s/Gpu.*Assert/Assert/") +if [ "${txt}" == "" ]; then + echo "No segmentation fault found in logs" +else + echo "${txt}" +fi + # Print out the MEK channelid debugging output echo \grep MEK ${scrdir}/logs_*/* | sed "s|${scrdir}/logs_||" | sed 's|_mad.*DEBUG:||' | sort -u diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index c9c9460105..9875c9cf7a 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: 
Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:04:14 +DATE: 2025-10-11_17:08:31 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3837 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.7444s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7368s - [COUNTERS] Fortran MEs ( 1 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7544s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7467s + [COUNTERS] Fortran MEs ( 1 ) : 0.0077s for 8192 events => throughput is 1.07E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2176s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2101s - [COUNTERS] Fortran MEs ( 1 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2221s + [COUNTERS] Fortran 
Overhead ( 0 ) : 0.2144s + [COUNTERS] Fortran MEs ( 1 ) : 0.0077s for 8192 events => throughput is 1.06E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,9 +116,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173944E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2197s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2123s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0071s for 8192 events => throughput is 1.15E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2222s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2147s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0072s for 8192 events => throughput is 1.14E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789448173944E-002 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.158620e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.149454e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.163690e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.182730e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,9 +161,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173944E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2221s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2173s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0045s for 8192 events => throughput is 1.81E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2208s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2160s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0045s for 8192 events => throughput is 1.82E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789448173944E-002 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.887925e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.914270e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.991506e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.995666e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,9 +206,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2160s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2124s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.50E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2170s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2130s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0037s for 8192 events => throughput is 2.23E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789448173971E-002 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.590914e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.533255e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.667984e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.641624e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,9 +251,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2167s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2131s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.46E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2163s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2127s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0034s for 8192 events => throughput is 2.41E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789448173971E-002 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.636316e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.651338e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.730901e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.725193e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,9 +296,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2188s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2145s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0041s for 8192 events => throughput is 2.01E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2180s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2136s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0041s for 8192 events => throughput is 1.98E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789448173971E-002 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.085135e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.065060e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.218811e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.156200e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -334,10 +341,10 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.6526s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6492s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.88E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s + [COUNTERS] PROGRAM TOTAL : 0.6520s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6479s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.21E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0034s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -348,44 +355,44 @@ OK! xsec from fortran (9.2432789448173985E-002) and cuda (9.2432789448173971E-00 OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.299210e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.427727e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.632885e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.442402e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.507229e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.123576e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.868548e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.069823e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.543060e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.084747e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.911449e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.494944e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.533062e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.063740e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.164979e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.415941e+08 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 13ceac3a87..fbf3c34fcc 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make USEBUILDDIR=1 BACKEND=cppsse4 - +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:04:39 +DATE: 2025-10-11_17:08:56 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3837 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.7443s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7370s - [COUNTERS] Fortran MEs ( 1 ) : 0.0073s for 8192 events => throughput is 1.12E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7580s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7502s + [COUNTERS] Fortran MEs ( 1 ) : 0.0077s for 8192 events => throughput is 1.06E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2183s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2108s - [COUNTERS] Fortran MEs ( 1 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2217s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2138s + [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.04E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,9 +116,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432777382586498E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2266s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2197s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0067s for 8192 events => throughput is 1.21E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2214s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2142s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0070s for 8192 events => throughput is 1.18E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432777382586498E-002 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.221258e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.197154e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.225429e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.200720e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,9 +161,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432774839452045E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2220s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2190s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.89E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2161s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2132s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 2.99E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432774839452045E-002 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.137547e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.577999e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.221144e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.183473e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,9 +206,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432774915924193E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2228s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2200s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.13E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2183s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2155s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.17E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432774915924193E-002 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.328121e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.468253e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.556846e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.468239e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,9 +251,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432774915924193E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2241s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2212s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 3.09E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2199s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2171s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.19E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432774915924193E-002 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.452418e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.276853e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.604389e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.494548e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,9 +296,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432778556608516E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2173s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2144s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 3.08E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2182s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2152s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.90E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432778556608516E-002 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.402847e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.354967e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.641263e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.469737e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432780016531851E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09243 [9.2432779972212775E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.6500s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6467s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.92E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.6719s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6677s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.25E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0036s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173985E-002) and cuda (9.2432780016531851E-002) differ by less than 4E-4 (1.0203783951112655e-07) +OK! xsec from fortran (9.2432789448173985E-002) and cuda (9.2432779972212775E-002) differ by less than 4E-4 (1.0251731308308365e-07) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.451436e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.421145e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.688055e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.263812e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.014252e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.466407e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.229387e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.768150e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.787718e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.574848e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.220221e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.510215e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.380548e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.891814e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.826286e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.714240e+08 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 093bec81e5..07ac440ea1 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:04:27 +DATE: 2025-10-11_17:08:44 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3837 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.7605s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7527s - [COUNTERS] Fortran MEs ( 1 ) : 0.0078s for 8192 events => throughput is 1.06E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7547s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7469s + [COUNTERS] Fortran MEs ( 1 ) : 0.0078s for 8192 events => throughput is 1.05E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2221s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2144s - [COUNTERS] Fortran MEs ( 1 ) : 0.0077s for 8192 events => throughput is 1.06E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2206s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2128s + [COUNTERS] Fortran MEs ( 1 ) : 0.0078s for 8192 events => throughput is 1.05E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,9 +116,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789444986618E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2212s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2136s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0073s for 8192 events => throughput is 1.12E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2248s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2169s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 8192 events => throughput is 1.08E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789444986618E-002 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.133245e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.138160e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.115304e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.141490e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,9 +161,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789444986618E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2168s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2123s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0042s for 8192 events => throughput is 1.94E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2174s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2129s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0043s for 8192 events => throughput is 1.90E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789444986618E-002 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.993139e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.989196e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.058944e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.027429e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,9 +206,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789444494415E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2174s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2138s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0034s for 8192 events => throughput is 2.42E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2195s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2156s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0036s for 8192 events => throughput is 2.30E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789444494415E-002 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.549665e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.540266e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.708708e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.722635e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,9 +251,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789444494415E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2179s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2143s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.49E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2175s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2136s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0036s for 8192 events => throughput is 2.26E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789444494415E-002 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.606715e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.634053e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.748967e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.703762e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,9 +296,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789444494415E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2165s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2123s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0039s for 8192 events => throughput is 2.10E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2186s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2143s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0040s for 8192 events => throughput is 2.06E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789444494415E-002 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.203720e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.160546e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.284212e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.303805e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432789437826970E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09243 [9.2432789453073233E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.6505s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6470s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.77E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s + [COUNTERS] PROGRAM TOTAL : 0.6515s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6475s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.22E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0033s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173985E-002) and cuda (9.2432789437826970E-002) differ by less than 2E-4 (1.1194101201539297e-10) +OK! xsec from fortran (9.2432789448173985E-002) and cuda (9.2432789453073233E-002) differ by less than 2E-4 (5.3003379463234523e-11) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.269035e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.593291e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.550305e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.163347e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.523745e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.056075e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.857337e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.054571e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.551254e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.089599e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.897534e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.480305e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.503798e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.035852e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.184430e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.419141e+08 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 794f102690..9182ca8a9b 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx + make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone - - make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
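
Note for readers of these tput/tmad logs: the `-p <blocks> <threads> <iterations>` arguments of the CHECK/GCHECK commands are a GPU-style launch configuration, and the number quoted in the label (8192, MAX, MAX128THR, MAX8THR) appears to correspond to the implied events per iteration, i.e. blocks times threads. A minimal sketch of that arithmetic follows; the helper name is ours and purely illustrative, not code from this repository.

```python
# Illustrative only: relate the "-p <blocks> <threads> <iterations>" arguments
# of CHECK/GCHECK to the event count quoted in the label (blocks x threads).
def events_per_iteration(blocks: int, threads: int) -> int:
    return blocks * threads

assert events_per_iteration(256, 32) == 8192      # CHECK(8192) / GCHECK(8192)
assert events_per_iteration(16384, 32) == 524288  # GCHECK(MAX)
assert events_per_iteration(4096, 128) == 524288  # GCHECK(MAX128THR)
assert events_per_iteration(65536, 8) == 524288   # GCHECK(MAX8THR)
```
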
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:04:53 +DATE: 2025-10-11_17:09:09 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 2613 events (found 5374 events) - [COUNTERS] PROGRAM TOTAL : 0.8494s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8073s - [COUNTERS] Fortran MEs ( 1 ) : 0.0421s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8533s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8106s + [COUNTERS] Fortran MEs ( 1 ) : 0.0426s for 8192 events => throughput is 1.92E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4510s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4085s - [COUNTERS] Fortran MEs ( 1 ) : 0.0425s for 8192 events => throughput is 1.93E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4516s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4087s + [COUNTERS] Fortran MEs ( 1 ) : 0.0430s for 8192 events => throughput is 1.91E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4555s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4098s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0453s for 8192 events => throughput is 1.81E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.4606s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4148s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0454s for 8192 events => throughput is 1.80E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138611968034162) differ by less than 3E-14 (0.0) +OK! 
xsec from fortran (47.138611968034176) and cpp (47.138611968034162) differ by less than 3E-14 (3.3306690738754696e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.856020e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.822539e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.865986e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.841641e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,28 +161,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4352s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4103s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0245s for 8192 events => throughput is 3.34E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4390s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4130s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0257s for 8192 events => throughput is 3.19E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138611968034162) differ by less than 3E-14 (0.0) +OK! xsec from fortran (47.138611968034176) and cpp (47.138611968034162) differ by less than 3E-14 (3.3306690738754696e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.314758e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.221117e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.321531e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.252405e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,28 +206,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4235s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4077s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0154s for 8192 events => throughput is 5.30E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4339s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4171s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0164s for 8192 events => throughput is 4.99E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138611968034162) differ by less than 3E-14 (0.0) +OK! xsec from fortran (47.138611968034176) and cpp (47.138611968034162) differ by less than 3E-14 (3.3306690738754696e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.263509e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.116784e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.327379e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.216981e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,28 +251,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4237s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4087s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0147s for 8192 events => throughput is 5.58E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.4313s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4153s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0156s for 8192 events => throughput is 5.24E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138611968034162) differ by less than 3E-14 (0.0) +OK! xsec from fortran (47.138611968034176) and cpp (47.138611968034162) differ by less than 3E-14 (3.3306690738754696e-16) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.648502e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.229787e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.831851e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.438042e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,28 +296,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034169] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4297s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4071s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0222s for 8192 events => throughput is 3.69E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4415s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4172s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0239s for 8192 events => throughput is 3.42E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (47.138611968034162) and cpp (47.138611968034169) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (47.138611968034176) and cpp (47.138611968034169) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.526689e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.514185e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.574003e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.539500e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138611968034169] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.8534s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8496s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.68E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0008s + [COUNTERS] PROGRAM TOTAL : 0.8618s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8570s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.20E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0040s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cuda (47.138611968034176) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (47.138611968034176) and cuda (47.138611968034169) differ by less than 3E-14 (1.1102230246251565e-16) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.103830e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.853419e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.448285e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.409968e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.875229e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.832304e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.627647e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.660331e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.886865e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.861253e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.006782e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.014024e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.862106e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.853068e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.715892e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.417253e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 2bf2a37cc7..7fd8a9128c 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx - +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make USEBUILDDIR=1 BACKEND=cppsse4 + +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
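
As a reading aid for the [COUNTERS] lines in these logs: the "=> throughput is ..." figures are consistent with a simple events-per-elapsed-second ratio. The sketch below is illustrative only (the helper name is ours, not repository code), and because the printed times are rounded it only reproduces the quoted throughputs approximately.

```python
# Illustrative only: recompute a [COUNTERS] throughput as events / seconds.
def throughput(n_events: int, seconds: float) -> float:
    return n_events / seconds

print(f"{throughput(8192, 0.0430):.2E}")  # ~1.91E+05, cf. a Fortran MEs counter above
print(f"{throughput(8192, 0.0007):.2E}")  # ~1.17E+07, cf. a CUDA MEs counter above (quoted 1.20E+07)
```
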
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:05:22 +DATE: 2025-10-11_17:09:38 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 2613 events (found 5374 events) - [COUNTERS] PROGRAM TOTAL : 0.8450s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8027s - [COUNTERS] Fortran MEs ( 1 ) : 0.0423s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8468s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8038s + [COUNTERS] Fortran MEs ( 1 ) : 0.0430s for 8192 events => throughput is 1.91E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4516s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4094s - [COUNTERS] Fortran MEs ( 1 ) : 0.0422s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4561s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4127s + [COUNTERS] Fortran MEs ( 1 ) : 0.0434s for 8192 events => throughput is 1.89E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138606099989779] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4548s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4118s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0426s for 8192 events => throughput is 1.92E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4596s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4159s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0434s for 8192 events => throughput is 1.89E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138606099989779) differ by less than 4E-4 (1.2448487851646206e-07) +OK! 
xsec from fortran (47.138611968034176) and cpp (47.138606099989779) differ by less than 4E-4 (1.2448487873850667e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.973574e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.924656e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.981282e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.925228e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,28 +161,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138602111070696] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4326s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4154s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0170s for 8192 events => throughput is 4.81E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4334s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4155s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0176s for 8192 events => throughput is 4.64E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138602111070696) differ by less than 4E-4 (2.091059336795098e-07) +OK! xsec from fortran (47.138611968034176) and cpp (47.138602111070696) differ by less than 4E-4 (2.091059339015544e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.659841e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.677131e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.743814e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.687091e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,28 +206,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138602499179925] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4174s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4080s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0091s for 8192 events => throughput is 8.97E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4249s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4152s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0095s for 8192 events => throughput is 8.65E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138602499179925) differ by less than 4E-4 (2.008725722424387e-07) +OK! xsec from fortran (47.138611968034176) and cpp (47.138602499179925) differ by less than 4E-4 (2.0087257257550561e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.079796e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.918801e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.235810e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.134969e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,28 +251,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138602499179925] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4181s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4092s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0086s for 8192 events => throughput is 9.50E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4245s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4152s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0091s for 8192 events => throughput is 9.01E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (47.138611968034162) and cpp (47.138602499179925) differ by less than 4E-4 (2.008725722424387e-07) +OK! xsec from fortran (47.138611968034176) and cpp (47.138602499179925) differ by less than 4E-4 (2.0087257257550561e-07) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.970038e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.308113e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.765544e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.304031e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,28 +296,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138606840950104] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4258s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4131s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0124s for 8192 events => throughput is 6.60E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4294s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4163s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0128s for 8192 events => throughput is 6.41E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138606840950104) differ by less than 4E-4 (1.0876612277499476e-07) +OK! xsec from fortran (47.138611968034176) and cpp (47.138606840950104) differ by less than 4E-4 (1.0876612310806166e-07) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.636236e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.713633e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.862568e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.787911e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138612402172164] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138612400084860] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.8671s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8634s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.65E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s + [COUNTERS] PROGRAM TOTAL : 0.8642s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8595s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.07E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0039s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cuda (47.138612402172164) differ by less than 4E-4 (9.209817353195149e-09) +OK! xsec from fortran (47.138611968034176) and cuda (47.138612400084860) differ by less than 4E-4 (9.16553677399179e-09) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.093880e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.299593e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.450343e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.634270e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.021092e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.759880e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.359313e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.744455e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.014796e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.777428e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.375647e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.990089e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.628808e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.374093e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.004427e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.364214e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 2ae843d323..e56bc4eee0 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,4 +1,7 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx make USEBUILDDIR=1 BACKEND=cuda @@ -6,36 +9,40 @@ make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
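
The "differ by less than <tolerance> (<delta>)" cross-section checks that recur throughout these logs are consistent with a relative difference computed as |1 - xsec_test/xsec_ref| and compared against a backend-dependent tolerance (e.g. 3E-14 for double, 4E-4 for float, 2E-4 for mixed precision). The sketch below is an inference from the printed values, not the actual tmad comparison script; the function name is ours.

```python
# Illustrative only: a relative-difference check consistent with the deltas
# printed in these logs (reference xsec from Fortran, test xsec from cpp/cuda).
def xsec_close(xsec_ref: float, xsec_test: float, tol: float) -> tuple[bool, float]:
    delta = abs(1.0 - xsec_test / xsec_ref)
    return delta < tol, delta

ok, delta = xsec_close(47.138611968034176, 47.138611968034169, 3e-14)
print(ok, delta)  # True 1.1102230246251565e-16, matching a double-precision 512z check above
```
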
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:05:08 +DATE: 2025-10-11_17:09:23 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 2613 events (found 5374 events) - [COUNTERS] PROGRAM TOTAL : 0.8439s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8015s - [COUNTERS] Fortran MEs ( 1 ) : 0.0424s for 8192 events => throughput is 1.93E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8528s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8099s + [COUNTERS] Fortran MEs ( 1 ) : 0.0429s for 8192 events => throughput is 1.91E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4498s + [COUNTERS] PROGRAM TOTAL : 0.4512s [COUNTERS] Fortran Overhead ( 0 ) : 0.4080s - [COUNTERS] Fortran MEs ( 1 ) : 0.0418s for 8192 events => throughput is 1.96E+05 events/s + [COUNTERS] Fortran MEs ( 1 ) : 0.0433s for 8192 events => throughput is 1.89E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138613306947967] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4576s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4121s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0451s for 8192 events => throughput is 1.82E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4607s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4140s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0463s for 8192 events => throughput is 1.77E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138613306947967) differ by less than 2E-4 (2.8403759566586473e-08) +OK! 
xsec from fortran (47.138611968034176) and cpp (47.138613306947967) differ by less than 2E-4 (2.8403759344541868e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.815647e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.819635e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.845071e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.820245e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,28 +161,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138613306947953] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4358s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4106s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0248s for 8192 events => throughput is 3.31E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4365s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4109s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0253s for 8192 events => throughput is 3.24E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138613306947953) differ by less than 2E-4 (2.8403759344541868e-08) +OK! xsec from fortran (47.138611968034176) and cpp (47.138613306947953) differ by less than 2E-4 (2.8403759122497263e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.291111e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.279259e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.339005e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.279521e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,28 +206,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138613350418019] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4251s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4094s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0154s for 8192 events => throughput is 5.31E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4291s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4132s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0154s for 8192 events => throughput is 5.30E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138613350418019) differ by less than 2E-4 (2.932593479165746e-08) +OK! xsec from fortran (47.138611968034176) and cpp (47.138613350418019) differ by less than 2E-4 (2.932593434756825e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.315398e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.322301e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.422217e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.904240e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,28 +251,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138613350418019] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4227s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4081s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0143s for 8192 events => throughput is 5.73E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4297s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4143s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0151s for 8192 events => throughput is 5.44E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (47.138611968034162) and cpp (47.138613350418019) differ by less than 2E-4 (2.932593479165746e-08) +OK! xsec from fortran (47.138611968034176) and cpp (47.138613350418019) differ by less than 2E-4 (2.932593434756825e-08) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.854463e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.558424e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.901611e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.634376e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,28 +296,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138613350418019] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4322s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4099s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0219s for 8192 events => throughput is 3.74E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4402s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4164s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0235s for 8192 events => throughput is 3.49E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138613350418019) differ by less than 2E-4 (2.932593479165746e-08) +OK! xsec from fortran (47.138611968034176) and cpp (47.138613350418019) differ by less than 2E-4 (2.932593434756825e-08) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.724588e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.654630e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.694617e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.679375e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611963547788] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138613294297848] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.8506s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8468s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.66E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0008s + [COUNTERS] PROGRAM TOTAL : 0.8631s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8584s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.15E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0041s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cuda (47.138611963547788) differ by less than 2E-4 (9.517409083059647e-11) +OK! xsec from fortran (47.138611968034176) and cuda (47.138613294297848) differ by less than 2E-4 (2.8135399343653944e-08) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.987528e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.912312e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.325954e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.471933e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.868584e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.863402e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.589038e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.634047e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.871326e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.849540e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.949192e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.953899e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.873573e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.847641e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.717025e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.416006e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 0c7ed732ed..d8d6f34ca2 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
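
Note on the "differ by less than ..." lines above: the Fortran and cudacpp cross sections are accepted when their relative difference falls below a threshold that depends on the floating-point mode (2E-4 here, 3E-14 and 4E-4 elsewhere in these logs). A minimal C++ sketch of such a check, assuming the printed number is |a-b| scaled by the reference value (the exact denominator used by the madgraph4gpu test scripts is not shown in these logs, but this reproduces the printed relative difference to its leading digits); the function name is hypothetical:

#include <cmath>
#include <cstdio>

// Hedged sketch of the "OK! xsec ... differ by less than TOL (reldif)" check seen above.
bool xsecCompatible( double xsecRef, double xsecNew, double tolerance )
{
  const double reldif = std::fabs( xsecNew - xsecRef ) / std::fabs( xsecRef );
  std::printf( "xsec ref=%.17g new=%.17g reldif=%.17g (tol=%g): %s\n",
               xsecRef, xsecNew, reldif, tolerance, reldif < tolerance ? "OK" : "ERROR" );
  return reldif < tolerance;
}

int main()
{
  // Numbers taken from the gg_ttx mixed-precision comparison above (reldif ~2.84e-08 < 2E-4)
  return xsecCompatible( 47.138611968034176, 47.138613306947953, 2e-4 ) ? 0 : 1;
}
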
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:05:36 +DATE: 2025-10-11_17:09:52 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748567E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 387 events (found 1591 events) - [COUNTERS] PROGRAM TOTAL : 0.7416s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4124s - [COUNTERS] Fortran MEs ( 1 ) : 0.3292s for 8192 events => throughput is 2.49E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7558s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4158s + [COUNTERS] Fortran MEs ( 1 ) : 0.3400s for 8192 events => throughput is 2.41E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748567E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7177s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3873s - [COUNTERS] Fortran MEs ( 1 ) : 0.3304s for 8192 events => throughput is 2.48E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7272s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3869s + [COUNTERS] Fortran MEs ( 1 ) : 0.3403s for 8192 events => throughput is 2.41E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7353s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3872s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3470s for 8192 events => throughput is 2.36E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7509s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3914s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3585s for 8192 events => throughput is 2.29E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0011s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471485809748553E-002) differ by less than 3E-14 (2.220446049250313e-16) +OK! 
xsec from fortran (7.8471485809748553E-002) and cpp (7.8471485809748553E-002) differ by less than 3E-14 (0.0) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.455924e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.384792e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.454100e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.379994e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,28 +161,28 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748567E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.5656s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3857s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1793s for 8192 events => throughput is 4.57E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5787s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3912s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1868s for 8192 events => throughput is 4.39E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471485809748567E-002) differ by less than 3E-14 (0.0) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471485809748567E-002) differ by less than 3E-14 (2.220446049250313e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.669927e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.477039e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.620836e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.489628e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,28 +206,28 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748595E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4792s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3884s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0903s for 8192 events => throughput is 9.07E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4876s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3928s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0942s for 8192 events => throughput is 8.69E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471485809748595E-002) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471485809748595E-002) differ by less than 3E-14 (4.440892098500626e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.331277e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.903439e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.327490e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.886830e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,28 +251,28 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748595E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4693s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3876s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0812s for 8192 events => throughput is 1.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4804s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3924s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0874s for 8192 events => throughput is 9.37E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (7.8471485809748567E-002) and cpp (7.8471485809748595E-002) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471485809748595E-002) differ by less than 3E-14 (4.440892098500626e-16) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.048553e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.779459e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.042752e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.857066e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748581E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471485809748595E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.5035s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3879s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1150s for 8192 events => throughput is 7.13E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5118s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3923s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1188s for 8192 events => throughput is 6.90E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471485809748581E-002) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471485809748595E-002) differ by less than 3E-14 (4.440892098500626e-16) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.198283e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.951589e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.275587e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.994069e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -334,58 +341,58 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.8395s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8270s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0095s for 8192 events => throughput is 8.61E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0030s + [COUNTERS] PROGRAM TOTAL : 0.8402s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8333s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0013s for 8192 events => throughput is 6.17E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0056s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cuda (7.8471485809748553E-002) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (7.8471485809748553E-002) and cuda (7.8471485809748553E-002) differ by less than 3E-14 (0.0) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.111479e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.930684e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.523607e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.049354e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.454522e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.010359e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.167720e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.220373e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.412863e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.008910e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.174227e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.368579e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.441638e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.010569e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.653840e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.799070e+06 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index fbc0c57cb4..405a8e9845 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg + make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone - make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
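
Note on the [COUNTERS] lines above: the events/s figure is simply the number of events divided by the time spent in the matrix elements (e.g. 8192 events in 0.3400s gives 2.41E+04 events/s). A minimal sketch of that arithmetic, purely for illustration; the helper function is hypothetical and only mirrors the printed format:

#include <cstdio>

// Reproduce the "[COUNTERS] ... => throughput is ..." arithmetic from the logs above.
void printMeCounter( const char* label, double seconds, int nevents )
{
  std::printf( "[COUNTERS] %s : %.4fs for %d events => throughput is %.2E events/s\n",
               label, seconds, nevents, nevents / seconds );
}

int main()
{
  printMeCounter( "Fortran MEs ( 1 )", 0.3400, 8192 ); // 2.41E+04 events/s, as in the log above
  printMeCounter( "CudaCpp MEs ( 2 )", 0.1868, 8192 ); // 4.39E+04 events/s, as in the sse4 run above
  return 0;
}

Note that the printed times are rounded to four decimals, so recomputing the throughput from them can differ slightly in the last digit from the value reported in the log.
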
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:06:11 +DATE: 2025-10-11_17:10:26 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748567E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 387 events (found 1591 events) - [COUNTERS] PROGRAM TOTAL : 0.7420s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4102s - [COUNTERS] Fortran MEs ( 1 ) : 0.3318s for 8192 events => throughput is 2.47E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7519s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4121s + [COUNTERS] Fortran MEs ( 1 ) : 0.3398s for 8192 events => throughput is 2.41E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748567E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7176s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3860s - [COUNTERS] Fortran MEs ( 1 ) : 0.3316s for 8192 events => throughput is 2.47E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7271s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3864s + [COUNTERS] Fortran MEs ( 1 ) : 0.3408s for 8192 events => throughput is 2.40E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471473453718410E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7234s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3899s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3325s for 8192 events => throughput is 2.46E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7291s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3913s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3369s for 8192 events => throughput is 2.43E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0009s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471473453718410E-002) differ by less than 4E-4 (1.574588530672827e-07) +OK! 
xsec from fortran (7.8471485809748553E-002) and cpp (7.8471473453718410E-002) differ by less than 4E-4 (1.5745885295626039e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.535876e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.486290e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.542086e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.478806e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -152,30 +159,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471459294758378E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471459219682932E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4904s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3886s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1014s for 8192 events => throughput is 8.08E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4955s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3907s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1044s for 8192 events => throughput is 7.85E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471459294758378E-002) differ by less than 4E-4 (3.37893311330717e-07) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471459219682932E-002) differ by less than 4E-4 (3.3885003380973444e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.182689e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.993300e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.204950e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.004232e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471459718665412E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471459708731872E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4358s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3891s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0464s for 8192 events => throughput is 1.77E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4415s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3925s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0486s for 8192 events => throughput is 1.69E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471459718665412E-002) differ by less than 4E-4 (3.324912595248364e-07) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471459708731872E-002) differ by less than 4E-4 (3.3261784726512644e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.782969e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.733359e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.783579e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.722443e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471459718665412E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471459708731872E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4301s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3871s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0426s for 8192 events => throughput is 1.92E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4378s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3922s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0452s for 8192 events => throughput is 1.81E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471459718665412E-002) differ by less than 4E-4 (3.324912595248364e-07) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471459708731872E-002) differ by less than 4E-4 (3.3261784726512644e-07) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.968891e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.850143e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.969858e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.891286e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471471932611128E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471471746130506E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4447s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3892s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0551s for 8192 events => throughput is 1.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4526s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3929s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0592s for 8192 events => throughput is 1.38E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471471932611128E-002) differ by less than 4E-4 (1.768430569759616e-07) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471471746130506E-002) differ by less than 4E-4 (1.792194693761573e-07) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.481854e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.406796e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.468460e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.412048e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471475012321185E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471471641207505E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.8373s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8327s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.36E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0011s + [COUNTERS] PROGRAM TOTAL : 0.8323s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8265s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0009s for 8192 events => throughput is 8.95E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0049s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cuda (7.8471475012321185E-002) differ by less than 4E-4 (1.375968260441951e-07) +OK! xsec from fortran (7.8471485809748553E-002) and cuda (7.8471471641207505E-002) differ by less than 4E-4 (1.8055655381932212e-07) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.717098e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.479157e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.890243e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.067147e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.313606e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.047251e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.232701e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.860004e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.300307e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.051348e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.230438e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.997681e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.193713e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.964172e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.247962e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.785109e+06 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 2422d3068f..b21554372e 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg - +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg make USEBUILDDIR=1 BACKEND=cuda + + make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 - make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
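
Note on the new header lines in these logs: each updated log now begins by echoing a few environment/build variables (MADGRAPH_CUDA_ARCHITECTURE, MADGRAPH_HIP_ARCHITECTURE, HASBLAS, CUDACPP_RUNTIME_BLASCOLORSUM, CUDACPP_RUNTIME_CUBLASTF32TENSOR, OMP_NUM_THREADS), all empty in these runs except HASBLAS=hasBlas. A minimal C++ sketch of that kind of echo, assuming nothing about the variables' semantics beyond what the logs show:

#include <cstdio>
#include <cstdlib>

// Print "NAME=value" for an environment variable, with an empty value when unset,
// matching the header lines of the logs above (the variables' meanings are not
// inferred here; this only reproduces the echo).
void echoEnv( const char* name )
{
  const char* value = std::getenv( name );
  std::printf( "%s=%s\n", name, value ? value : "" );
}

int main()
{
  echoEnv( "MADGRAPH_CUDA_ARCHITECTURE" );
  echoEnv( "MADGRAPH_HIP_ARCHITECTURE" );
  echoEnv( "HASBLAS" );
  echoEnv( "CUDACPP_RUNTIME_BLASCOLORSUM" );
  echoEnv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" );
  echoEnv( "OMP_NUM_THREADS" );
  return 0;
}
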
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:05:53 +DATE: 2025-10-11_17:10:09 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748567E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 387 events (found 1591 events) - [COUNTERS] PROGRAM TOTAL : 0.7391s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4096s - [COUNTERS] Fortran MEs ( 1 ) : 0.3295s for 8192 events => throughput is 2.49E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7553s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4138s + [COUNTERS] Fortran MEs ( 1 ) : 0.3415s for 8192 events => throughput is 2.40E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748567E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7165s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3855s - [COUNTERS] Fortran MEs ( 1 ) : 0.3310s for 8192 events => throughput is 2.47E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7268s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3875s + [COUNTERS] Fortran MEs ( 1 ) : 0.3393s for 8192 events => throughput is 2.41E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471486590207584E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7396s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3874s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3510s for 8192 events => throughput is 2.33E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7475s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3883s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3580s for 8192 events => throughput is 2.29E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0011s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471486590207584E-002) differ by less than 2E-4 (9.945765766516956e-09) +OK! 
xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486590207584E-002) differ by less than 2E-4 (9.945765988561561e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.409349e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.359867e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.415956e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.360283e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -152,30 +159,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471486540430027E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471486557993325E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.5676s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3876s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1792s for 8192 events => throughput is 4.57E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5750s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3921s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1821s for 8192 events => throughput is 4.50E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471486540430027E-002) differ by less than 2E-4 (9.311426296676473e-09) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486557993325E-002) differ by less than 2E-4 (9.535244149816435e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.653483e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.570903e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.691370e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.571774e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471486395956899E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471486463614210E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4809s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3907s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0896s for 8192 events => throughput is 9.14E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4882s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3954s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0922s for 8192 events => throughput is 8.88E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471486395956899E-002) differ by less than 2E-4 (7.470335683379403e-09) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486463614210E-002) differ by less than 2E-4 (8.332525558429893e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.402724e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.192817e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.391101e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.186620e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471486395956899E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471486463614210E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4660s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3858s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0796s for 8192 events => throughput is 1.03E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4787s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3937s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0844s for 8192 events => throughput is 9.71E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471486395956899E-002) differ by less than 2E-4 (7.470335683379403e-09) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486463614210E-002) differ by less than 2E-4 (8.332525558429893e-09) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.055172e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.002954e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.066925e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.000380e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,28 +296,28 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471486537749241E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.5026s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3850s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1170s for 8192 events => throughput is 7.00E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s + [COUNTERS] PROGRAM TOTAL : 0.5085s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3899s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1179s for 8192 events => throughput is 6.95E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471486537749241E-002) differ by less than 2E-4 (9.277263846030337e-09) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486537749241E-002) differ by less than 2E-4 (9.277264068074942e-09) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.005425e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.931283e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.056979e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.899982e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485791426987E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471486543087457E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.8432s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8306s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0095s for 8192 events => throughput is 8.66E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0031s + [COUNTERS] PROGRAM TOTAL : 0.8420s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8352s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0014s for 8192 events => throughput is 5.93E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0055s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cuda (7.8471485791426987E-002) differ by less than 2E-4 (2.334807902570901e-10) +OK! xsec from fortran (7.8471485809748553E-002) and cuda (7.8471486543087457E-002) differ by less than 2E-4 (9.345291429596614e-09) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.128450e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.941062e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.439893e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.043050e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.421024e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.003879e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.153444e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.219422e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.432988e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.007497e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.169695e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.367555e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.432146e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.012869e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.638179e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.798121e+06 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 5517ab4292..fcf14d36a5 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:06:26 +DATE: 2025-10-11_17:10:42 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 223 events) - [COUNTERS] PROGRAM TOTAL : 4.6353s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3093s - [COUNTERS] Fortran MEs ( 1 ) : 4.3260s for 8192 events => throughput is 1.89E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.8675s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3041s + [COUNTERS] Fortran MEs ( 1 ) : 4.5634s for 8192 events => throughput is 1.80E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.5825s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2904s - [COUNTERS] Fortran MEs ( 1 ) : 4.2921s for 8192 events => throughput is 1.91E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.8255s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2969s + [COUNTERS] Fortran MEs ( 1 ) : 4.5287s for 8192 events => throughput is 1.81E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,10 +116,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240192] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.7512s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.4476s for 8192 events => throughput is 1.84E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0090s + [COUNTERS] PROGRAM TOTAL : 4.8499s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2944s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.5463s for 8192 events => throughput is 1.80E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0092s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786561240192) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.894558e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.855071e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.891638e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.864869e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,10 +161,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240192] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 2.6638s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2927s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.3665s for 8192 events => throughput is 3.46E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0045s + [COUNTERS] PROGRAM TOTAL : 2.8407s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2953s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.5401s for 8192 events => throughput is 3.23E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0053s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786561240192) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.547129e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.391185e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.542201e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.371248e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,10 +206,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.3332s + [COUNTERS] PROGRAM TOTAL : 1.3634s [COUNTERS] Fortran Overhead ( 0 ) : 0.2951s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0356s for 8192 events => throughput is 7.91E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0025s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0657s for 8192 events => throughput is 7.69E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0026s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786561240197) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.118919e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.818945e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.114943e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.888581e+03 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,9 +251,9 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.2181s + [COUNTERS] PROGRAM TOTAL : 1.2373s [COUNTERS] Fortran Overhead ( 0 ) : 0.2951s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9207s for 8192 events => throughput is 8.90E+03 events/s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9400s for 8192 events => throughput is 8.71E+03 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0022s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786561240197) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.276674e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.864841e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.241984e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.851817e+03 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,10 +296,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.4646s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2912s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1708s for 8192 events => throughput is 7.00E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0026s + [COUNTERS] PROGRAM TOTAL : 1.5242s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2959s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.2254s for 8192 events => throughput is 6.69E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0029s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786561240197) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.083404e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.755860e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.099846e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.706109e+03 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786561240192] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.8110s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7374s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0388s for 8192 events => throughput is 2.11E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0348s + [COUNTERS] PROGRAM TOTAL : 0.7754s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7315s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0192s for 8192 events => throughput is 4.26E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0246s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cuda (0.33144786561240192) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (0.33144786561240197) and cuda (0.33144786561240197) differ by less than 3E-14 (0.0) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.149005e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.416533e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.350783e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.462010e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.129093e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.359331e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 512 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.172100e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.449399e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.126645e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.367790e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.170032e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.440795e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.144552e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.383135e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK 
-EvtsPerSec[MECalcOnly] (3a) = ( 1.426547e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.480569e+05 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 78567e12c9..5c635cc8ef 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:08:49 +DATE: 2025-10-11_17:12:25 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 223 events) - [COUNTERS] PROGRAM TOTAL : 4.5864s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2940s - [COUNTERS] Fortran MEs ( 1 ) : 4.2923s for 8192 events => throughput is 1.91E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.8704s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2988s + [COUNTERS] Fortran MEs ( 1 ) : 4.5716s for 8192 events => throughput is 1.79E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.5924s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2886s - [COUNTERS] Fortran MEs ( 1 ) : 4.3038s for 8192 events => throughput is 1.90E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.8250s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2965s + [COUNTERS] Fortran MEs ( 1 ) : 4.5284s for 8192 events => throughput is 1.81E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -107,30 +114,30 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144941544531159] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144941326459554] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.6210s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2941s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.3184s for 8192 events => throughput is 1.90E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0085s + [COUNTERS] PROGRAM TOTAL : 4.7411s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.4378s for 8192 events => throughput is 1.85E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0087s *** (2-none) Compare MADEVENT_CPP x1 xsec to 
MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144941544531159) differ by less than 4E-4 (4.675947774535061e-06) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144941326459554) differ by less than 4E-4 (4.669368411036601e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.957206e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.908171e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.957921e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.916943e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,10 +161,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144937378275385] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.4924s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2933s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1966s for 8192 events => throughput is 6.85E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0025s + [COUNTERS] PROGRAM TOTAL : 1.5212s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2931s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.2254s for 8192 events => throughput is 6.68E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0027s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144937378275385) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.048957e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.792707e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.041651e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.847129e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,9 +206,9 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144939353225550] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.8128s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2926s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5189s for 8192 events => throughput is 1.58E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8295s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5336s for 8192 events => throughput is 1.54E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0013s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144939353225550) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.622272e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.560155e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.613287e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.556326e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,10 +251,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144939353225550] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.7779s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2950s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4817s for 8192 events => throughput is 1.70E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0012s + [COUNTERS] PROGRAM TOTAL : 0.7790s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2954s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4823s for 8192 events => throughput is 1.70E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0013s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144939353225550) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.826080e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.756110e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.802534e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.758530e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,9 +296,9 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144947551388249] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.8771s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2920s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5836s for 8192 events => throughput is 1.40E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.9014s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6052s for 8192 events => throughput is 1.35E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0015s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144947551388249) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.430502e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.375609e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.421428e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.357712e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144955535316123] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144804761684321] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.7866s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7350s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0271s for 8192 events => throughput is 3.02E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0245s + [COUNTERS] PROGRAM TOTAL : 0.7725s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7390s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0108s for 8192 events => throughput is 7.56E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0227s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (0.33144786561240197) and cuda (0.33144955535316123) differ by less than 4E-4 (5.0980589545446264e-06) +OK! xsec from fortran (0.33144786561240197) and cuda (0.33144804761684321) differ by less than 4E-4 (5.491193642015446e-07) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.089397e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.844164e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.388762e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.016020e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.126017e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.967323e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 512 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.254976e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.138637e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.087410e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.960156e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.221892e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.136855e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.084262e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
7.944572e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.392382e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.273692e+05 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 0f7d6f4131..2f61c77e8d 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,4 +1,7 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg make USEBUILDDIR=1 BACKEND=cuda @@ -6,36 +9,40 @@ make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:07:37 +DATE: 2025-10-11_17:11:34 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 223 events) - [COUNTERS] PROGRAM TOTAL : 4.5989s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2922s - [COUNTERS] Fortran MEs ( 1 ) : 4.3067s for 8192 events => throughput is 1.90E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.8471s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2979s + [COUNTERS] Fortran MEs ( 1 ) : 4.5492s for 8192 events => throughput is 1.80E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.6012s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2899s - [COUNTERS] Fortran MEs ( 1 ) : 4.3113s for 8192 events => throughput is 1.90E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.8278s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2989s + [COUNTERS] Fortran MEs ( 1 ) : 4.5289s for 8192 events => throughput is 1.81E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,9 +116,9 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786734542164] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.8059s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2941s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.5027s for 8192 events => throughput is 1.82E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.9193s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.6155s for 8192 events => throughput is 1.77E+03 
events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0091s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786734542164) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.881337e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.840344e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.867505e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.842142e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,10 +161,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786651655289] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 2.6829s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2920s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.3862s for 8192 events => throughput is 3.43E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0047s + [COUNTERS] PROGRAM TOTAL : 2.7307s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2968s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.4288s for 8192 events => throughput is 3.37E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0050s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786651655289) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.548157e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.428088e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.537868e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.464566e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,10 +206,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786627894518] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.3285s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2936s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0325s for 8192 events => throughput is 7.93E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0024s + [COUNTERS] PROGRAM TOTAL : 1.3474s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2970s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0479s for 8192 events => throughput is 7.82E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0025s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786627894518) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.171504e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.942226e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.183239e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.692396e+03 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,9 +251,9 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786627894518] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.1999s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2927s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9051s for 8192 events => throughput is 9.05E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.2106s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9138s for 8192 events => throughput is 8.96E+03 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0022s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786627894518) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.165581e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.272414e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.350878e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.142833e+03 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,10 +296,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786627894518] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.4750s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2928s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1793s for 8192 events => throughput is 6.95E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0029s + [COUNTERS] PROGRAM TOTAL : 1.5269s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3007s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.2234s for 8192 events => throughput is 6.70E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0028s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786627894518) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.035517e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.830218e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.843003e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.809509e+03 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786533876569] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144786716305458] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.8136s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7401s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0389s for 8192 events => throughput is 2.11E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0347s + [COUNTERS] PROGRAM TOTAL : 0.7808s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7376s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0192s for 8192 events => throughput is 4.27E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0240s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (0.33144786561240197) and cuda (0.33144786533876569) differ by less than 2E-4 (8.255786054789382e-10) +OK! xsec from fortran (0.33144786561240197) and cuda (0.33144786716305458) differ by less than 2E-4 (4.6784207619055e-09) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.142259e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.383309e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.350796e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.484069e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.127674e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.409887e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 512 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.154284e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.456801e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.123213e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.362526e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.173815e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.463078e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.121978e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
4.357037e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.416494e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.491061e+05 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 74862dd5f7..fe6b10b3d3 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppnone - +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:10:25 +DATE: 2025-10-11_17:13:52 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 100.9475s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5365s - [COUNTERS] Fortran MEs ( 1 ) : 100.4109s for 8192 events => throughput is 8.16E+01 events/s + [COUNTERS] PROGRAM TOTAL : 102.2505s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5363s + [COUNTERS] Fortran MEs ( 1 ) : 101.7141s for 8192 events => throughput is 8.05E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 100.8105s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5296s - [COUNTERS] Fortran MEs ( 1 ) : 100.2810s for 8192 events => throughput is 8.17E+01 events/s + [COUNTERS] PROGRAM TOTAL : 102.2069s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5320s + [COUNTERS] Fortran MEs ( 1 ) : 101.6749s for 8192 events => throughput is 8.06E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,10 +116,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282475E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - 
[COUNTERS] PROGRAM TOTAL : 127.1376s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5284s - [COUNTERS] CudaCpp MEs ( 2 ) : 126.4018s for 8192 events => throughput is 6.48E+01 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.2074s + [COUNTERS] PROGRAM TOTAL : 128.7427s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5353s + [COUNTERS] CudaCpp MEs ( 2 ) : 127.9956s for 8192 events => throughput is 6.40E+01 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.2118s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561551282475E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.678586e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.580483e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.694101e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.620995e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,10 +161,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282467E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 61.7097s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5331s - [COUNTERS] CudaCpp MEs ( 2 ) : 61.0765s for 8192 events => throughput is 1.34E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.1001s + [COUNTERS] PROGRAM TOTAL : 69.6189s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5284s + [COUNTERS] CudaCpp MEs ( 2 ) : 68.9781s for 8192 events => throughput is 1.19E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.1125s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561551282467E-007 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.591189e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.424482e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.580161e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.419676e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,10 +206,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282467E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 29.3577s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5263s - [COUNTERS] CudaCpp MEs ( 2 ) : 28.7837s for 8192 events => throughput is 2.85E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0477s + [COUNTERS] PROGRAM TOTAL : 30.3572s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5354s + [COUNTERS] CudaCpp MEs ( 2 ) : 29.7726s for 8192 events => throughput is 2.75E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0492s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561551282467E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.407090e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.296671e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.415212e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.296231e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,10 +251,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282467E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 26.2469s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5271s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.6788s for 8192 events => throughput is 3.19E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0410s + [COUNTERS] PROGRAM TOTAL : 26.8666s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5340s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.2902s for 8192 events => throughput is 3.12E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0424s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561551282467E-007 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.913687e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.796432e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.895964e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.783837e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,10 +296,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282467E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 26.1607s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5255s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.5871s for 8192 events => throughput is 3.20E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0481s + [COUNTERS] PROGRAM TOTAL : 27.2211s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5330s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.6390s for 8192 events => throughput is 3.08E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0491s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561551282467E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.408791e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.322007e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.444614e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.342992e+02 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561551282475E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.357e-07 [2.3572561551282422E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 3.3131s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1215s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1040s for 8192 events => throughput is 7.42E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 1.0875s + [COUNTERS] PROGRAM TOTAL : 2.0387s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0768s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6155s for 8192 events => throughput is 1.33E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.3464s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (2.3572561551282417E-007) and cuda (2.3572561551282475E-007) differ by less than 3E-14 (2.4424906541753444e-15) +OK! xsec from fortran (2.3572561551282417E-007) and cuda (2.3572561551282422E-007) differ by less than 3E-14 (2.220446049250313e-16) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.491511e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.336265e+04 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.275455e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.298842e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.282089e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.363941e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.552042e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.311264e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.301465e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.338602e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.448921e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.323398e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.252906e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.336359e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] 
[inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.241973e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.336023e+03 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index bfa4b4cda4..da0706ada3 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone - make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:42:40 +DATE: 2025-10-11_17:46:23 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 100.8152s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5282s - [COUNTERS] Fortran MEs ( 1 ) : 100.2871s for 8192 events => throughput is 8.17E+01 events/s + [COUNTERS] PROGRAM TOTAL : 102.9219s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5367s + [COUNTERS] Fortran MEs ( 1 ) : 102.3853s for 8192 events => throughput is 8.00E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 100.7247s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5322s - [COUNTERS] Fortran MEs ( 1 ) : 100.1925s for 8192 events => throughput is 8.18E+01 events/s + [COUNTERS] PROGRAM TOTAL : 102.9948s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5407s + [COUNTERS] Fortran MEs ( 1 ) : 102.4541s for 8192 events => throughput is 8.00E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -100,7 +107,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 @@ -108,30 +114,30 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.358e-07 [2.3575849446922190E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.358e-07 [2.3575849511111252E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 112.7914s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5240s - [COUNTERS] CudaCpp MEs ( 2 ) : 112.0829s for 8192 events => throughput is 7.31E+01 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.1845s + [COUNTERS] PROGRAM TOTAL : 116.5594s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5371s + [COUNTERS] CudaCpp MEs ( 2 ) : 115.8332s for 8192 events => throughput is 7.07E+01 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.1891s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575849446922190E-007) differ by less than 4E-4 (0.00013947977747852391) +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575849511111252E-007) differ by less than 4E-4 (0.00013948250052009392) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.631916e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.535383e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.625132e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.441970e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -146,7 +152,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 @@ -156,10 +161,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.358e-07 [2.3575845178322101E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 28.7980s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5271s - [COUNTERS] CudaCpp MEs ( 2 ) : 28.2235s for 8192 events => throughput is 2.90E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0474s + [COUNTERS] PROGRAM TOTAL : 31.5456s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5700s + [COUNTERS] CudaCpp MEs ( 2 ) : 30.9224s for 8192 events => throughput is 2.65E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0531s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -170,14 +175,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575845178322101E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.386203e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.071038e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.374145e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.043650e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -192,7 +197,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 @@ -202,10 +206,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.358e-07 [2.3575845169411084E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 14.8120s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5245s - [COUNTERS] CudaCpp MEs ( 2 ) : 14.2638s for 8192 events => throughput is 5.74E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0236s + [COUNTERS] PROGRAM TOTAL : 15.3844s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5370s + [COUNTERS] CudaCpp MEs ( 2 ) : 14.8227s for 8192 events => throughput is 5.53E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0247s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -216,14 +220,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575845169411084E-007 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.872770e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.685687e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.864576e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.672269e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -238,7 +242,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 @@ -248,10 +251,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.358e-07 [2.3575845169411084E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 13.3091s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5262s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.7618s for 8192 events => throughput is 6.42E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0211s + [COUNTERS] PROGRAM TOTAL : 13.6990s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5329s + [COUNTERS] CudaCpp MEs ( 2 ) : 13.1447s for 8192 events => throughput is 6.23E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0214s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -262,14 +265,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575845169411084E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.728743e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.552784e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.768099e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.581015e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -284,7 +287,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 @@ -294,10 +296,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.358e-07 [2.3575850859831750E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 13.2286s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5280s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.6780s for 8192 events => throughput is 6.46E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0225s + [COUNTERS] PROGRAM TOTAL : 13.9360s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5476s + [COUNTERS] CudaCpp MEs ( 2 ) : 13.3630s for 8192 events => throughput is 6.13E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0254s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -308,14 +310,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575850859831750E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.948019e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.686443e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.969717e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.667526e+02 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -337,60 +339,60 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.358e-07 [2.3575862304433055E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.357e-07 [2.3572568120113116E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 2.2079s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1084s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5470s for 8192 events => throughput is 1.50E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.5524s + [COUNTERS] PROGRAM TOTAL : 1.5254s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0122s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2322s for 8192 events => throughput is 3.53E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.2811s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cuda (2.3575862304433055E-007) differ by less than 4E-4 (0.00014002522141920437) +OK! xsec from fortran (2.3572561551282417E-007) and cuda (2.3572568120113116E-007) differ by less than 4E-4 (2.78664271879947e-07) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.517499e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.547134e+04 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.545233e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.607921e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.140576e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.571279e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.181453e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.601694e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.126165e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.579531e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.164632e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.607459e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.163932e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.584591e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.073078e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.996351e+03 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 3a68950921..972fcc6999 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,4 +1,7 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg make USEBUILDDIR=1 BACKEND=cuda @@ -6,36 +9,40 @@ make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 -make USEBUILDDIR=1 BACKEND=cppavx2 +make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:26:37 +DATE: 2025-10-11_17:30:19 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 101.1381s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5302s - [COUNTERS] Fortran MEs ( 1 ) : 100.6080s for 8192 events => throughput is 8.14E+01 events/s + [COUNTERS] PROGRAM TOTAL : 102.1691s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5343s + [COUNTERS] Fortran MEs ( 1 ) : 101.6348s for 8192 events => throughput is 8.06E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 100.8808s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5357s - [COUNTERS] Fortran MEs ( 1 ) : 100.3451s for 8192 events => throughput is 8.16E+01 events/s + [COUNTERS] PROGRAM TOTAL : 102.2057s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5327s + [COUNTERS] Fortran MEs ( 1 ) : 101.6729s for 8192 events => throughput is 8.06E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,10 +116,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561678995975E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 123.7239s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5356s - [COUNTERS] CudaCpp MEs ( 2 ) : 122.9787s for 8192 events => throughput is 6.66E+01 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.2095s + [COUNTERS] PROGRAM TOTAL : 130.3996s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5377s + [COUNTERS] CudaCpp MEs ( 2 ) : 129.6472s for 8192 events => throughput is 6.32E+01 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.2147s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! 
xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561678995975E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.634632e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.490256e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.608909e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.489525e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,10 +161,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561701257335E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 64.5975s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5274s - [COUNTERS] CudaCpp MEs ( 2 ) : 63.9661s for 8192 events => throughput is 1.28E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.1041s + [COUNTERS] PROGRAM TOTAL : 64.8540s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5288s + [COUNTERS] CudaCpp MEs ( 2 ) : 64.2213s for 8192 events => throughput is 1.28E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.1039s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561701257335E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.549992e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.563988e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.544779e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.529721e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,10 +206,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561705911026E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 28.6856s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5254s - [COUNTERS] CudaCpp MEs ( 2 ) : 28.1150s for 8192 events => throughput is 2.91E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0453s + [COUNTERS] PROGRAM TOTAL : 28.8286s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5327s + [COUNTERS] CudaCpp MEs ( 2 ) : 28.2496s for 8192 events => throughput is 2.90E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0463s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! 
xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561705911026E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.581303e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.534195e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.574698e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.569719e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,10 +251,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561705911026E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 24.6205s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5315s - [COUNTERS] CudaCpp MEs ( 2 ) : 24.0503s for 8192 events => throughput is 3.41E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0387s + [COUNTERS] PROGRAM TOTAL : 26.1574s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5395s + [COUNTERS] CudaCpp MEs ( 2 ) : 25.5773s for 8192 events => throughput is 3.20E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0406s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561705911026E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.161373e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.054403e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.184852e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.039174e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,10 +296,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561705911026E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 25.7441s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5280s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.1699s for 8192 events => throughput is 3.25E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0462s + [COUNTERS] PROGRAM TOTAL : 26.7057s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5352s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.1230s for 8192 events => throughput is 3.14E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0475s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! 
xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561705911026E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.516660e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.438352e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.515216e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.447842e+02 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561518129465E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.357e-07 [2.3572561670766515E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 2.8461s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0822s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8795s for 8192 events => throughput is 9.31E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.8844s + [COUNTERS] PROGRAM TOTAL : 1.8201s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0131s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4965s for 8192 events => throughput is 1.65E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.3105s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cuda (2.3572561518129465E-007) differ by less than 2E-4 (1.4064212017217415e-09) +OK! xsec from fortran (2.3572561551282417E-007) and cuda (2.3572561670766515E-007) differ by less than 2E-4 (5.0687787300773834e-09) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.415473e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.664884e+04 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.080771e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.607592e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.106752e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.667090e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.156598e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.595955e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.106849e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.655497e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.103409e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.622539e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.111142e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.675870e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.667428e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.460940e+03 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 7310cfc72a..7c2d5d02c8 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:09:42 +DATE: 2025-10-11_17:13:08 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] Wrote 506 events (found 1943 events) - [COUNTERS] PROGRAM TOTAL : 0.5319s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4597s - [COUNTERS] Fortran MEs ( 1 ) : 0.0722s for 8192 events => throughput is 1.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5482s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4745s + [COUNTERS] Fortran MEs ( 1 ) : 0.0736s for 8192 events => throughput is 1.11E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4765s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4047s - [COUNTERS] Fortran MEs ( 1 ) : 0.0718s for 8192 events => throughput is 1.14E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4930s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4192s + [COUNTERS] Fortran MEs ( 1 ) : 0.0739s for 8192 events => throughput is 1.11E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,9 +116,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737132] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4865s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4077s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0781s for 8192 events => throughput is 1.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4901s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4103s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0791s for 8192 events => throughput is 1.04E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737132) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.073164e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.055904e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.079140e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.064104e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,9 +161,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737170] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4492s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4062s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0425s for 8192 events => throughput is 1.93E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4528s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4081s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0441s for 8192 events => throughput is 1.86E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737170) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.895347e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.868596e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.917908e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.882630e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,9 +206,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737162] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4396s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4134s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0257s for 8192 events => throughput is 3.19E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4341s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4076s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0260s for 8192 events => throughput is 3.16E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737162) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.340027e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.217719e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.307491e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.250909e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,9 +251,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737162] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4310s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4082s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0224s for 8192 events => throughput is 3.66E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4367s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4117s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0245s for 8192 events => throughput is 3.34E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737162) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.693677e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.377107e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.718907e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.445554e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,9 +296,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737162] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4438s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4093s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0340s for 8192 events => throughput is 2.41E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4456s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4100s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0350s for 8192 events => throughput is 2.34E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737162) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.386493e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.314404e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.395890e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.349276e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -334,10 +341,10 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737173] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.8495s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8451s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.52E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0011s + [COUNTERS] PROGRAM TOTAL : 0.8613s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8556s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.03E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0049s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -348,44 +355,44 @@ OK! xsec from fortran (0.20313504505737126) and cuda (0.20313504505737173) diffe OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.777000e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.568159e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.265214e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.455155e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.327919e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.192502e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.161258e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.014422e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] 
[hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.316740e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.214633e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.319766e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.430009e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.323054e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.226812e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.646948e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.646817e+07 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 748c92b28c..2376b74b06 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu + make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cppnone -make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make 
USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:10:11 +DATE: 2025-10-11_17:13:38 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] Wrote 506 events (found 1943 events) - [COUNTERS] PROGRAM TOTAL : 0.5240s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4523s - [COUNTERS] Fortran MEs ( 1 ) : 0.0718s for 8192 events => throughput is 1.14E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5325s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4601s + [COUNTERS] Fortran MEs ( 1 ) : 0.0724s for 8192 events => throughput is 1.13E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4796s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4074s - [COUNTERS] Fortran MEs ( 1 ) : 0.0721s for 8192 events => throughput is 1.14E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4871s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4143s + [COUNTERS] Fortran MEs ( 1 ) : 0.0728s for 8192 events => throughput is 1.13E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,9 +116,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313506133732837] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4786s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4057s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0723s for 8192 events => throughput is 1.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4843s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4086s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0751s for 8192 events => throughput is 1.09E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (0.20313504505737126) and cpp (0.20313506133732837) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.132089e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.108850e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.123977e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.108803e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,9 +161,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313502997679400] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4346s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4073s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0269s for 8192 events => throughput is 3.04E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4377s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4101s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0272s for 8192 events => throughput is 3.01E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (0.20313504505737126) and cpp (0.20313502997679400) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.016574e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.944992e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.049161e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.961979e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,9 +206,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313502619857851] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4231s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4098s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0130s for 8192 events => throughput is 6.28E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4227s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4085s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0138s for 8192 events => throughput is 5.95E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (0.20313504505737126) and cpp (0.20313502619857851) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.215183e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.824085e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.201945e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.049332e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,9 +251,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313502619857851] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4177s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4051s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0122s for 8192 events => throughput is 6.69E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4225s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4090s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0132s for 8192 events => throughput is 6.21E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (0.20313504505737126) and cpp (0.20313502619857851) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.557168e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.355595e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.659565e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.395017e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,9 +296,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313505300145301] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4231s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4064s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0163s for 8192 events => throughput is 5.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4271s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4088s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0179s for 8192 events => throughput is 4.58E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (0.20313504505737126) and cpp (0.20313505300145301) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.736521e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.628365e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.799657e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.648318e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313508590887899] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313508404553540] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.8496s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8457s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.64E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0009s + [COUNTERS] PROGRAM TOTAL : 0.8566s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8514s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.16E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0044s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cuda (0.20313508590887899) differ by less than 4E-4 (2.011051698502797e-07) +OK! xsec from fortran (0.20313504505737126) and cuda (0.20313508404553540) differ by less than 4E-4 (1.9193223965707773e-07) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.049327e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.202405e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.339018e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.296000e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.110522e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.115794e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.423874e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.024681e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.090502e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.134420e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.757351e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.104635e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.720065e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.797328e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.206204e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.751422e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index dd13a39319..cf138d100f 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone - +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:09:56 +DATE: 2025-10-11_17:13:23 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] Wrote 506 events (found 1943 events) - [COUNTERS] PROGRAM TOTAL : 0.5254s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4537s - [COUNTERS] Fortran MEs ( 1 ) : 0.0717s for 8192 events => throughput is 1.14E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5311s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4584s + [COUNTERS] Fortran MEs ( 1 ) : 0.0727s for 8192 events => throughput is 1.13E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4842s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4128s - [COUNTERS] Fortran MEs ( 1 ) : 0.0714s for 8192 events => throughput is 1.15E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4848s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4122s + [COUNTERS] Fortran MEs ( 1 ) : 0.0726s for 8192 events => throughput is 1.13E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,10 +116,10 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504495344831] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4899s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4117s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0776s for 8192 events => throughput is 1.06E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s + [COUNTERS] PROGRAM TOTAL : 0.4868s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4073s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0788s for 8192 events => throughput is 1.04E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504495344831) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.073352e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.054873e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.073996e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.059290e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -152,30 +159,30 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504495344833] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313504500016025] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4513s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4086s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0421s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4535s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4098s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0431s for 8192 events => throughput is 1.90E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504495344833) differ by less than 2E-4 (5.115952106393706e-10) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504500016025) differ by less than 2E-4 (2.816402666638851e-10) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.886911e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.896659e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.898728e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.911870e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504510700500] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313504510471836] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4424s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4165s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0255s for 8192 events => throughput is 3.22E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4326s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4072s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0250s for 8192 events => throughput is 3.28E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504510700500) differ by less than 2E-4 (2.4433854939331923e-10) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504510471836) differ by less than 2E-4 (2.3308177610203984e-10) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.243245e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.285561e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.311888e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.331125e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504510700500] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313504510471836] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4308s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4086s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0217s for 8192 events => throughput is 3.78E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4323s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4081s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0238s for 8192 events => throughput is 3.44E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504510700500) differ by less than 2E-4 (2.4433854939331923e-10) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504510471836) differ by less than 2E-4 (2.3308177610203984e-10) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.793279e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.491118e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.775522e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.400822e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504510700500] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313504510471836] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4486s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4131s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0350s for 8192 events => throughput is 2.34E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4453s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4096s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0351s for 8192 events => throughput is 2.33E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504510700500) differ by less than 2E-4 (2.4433854939331923e-10) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504510471836) differ by less than 2E-4 (2.3308177610203984e-10) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.316706e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.392779e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.334216e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.391910e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504512110778] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313504511630270] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.8511s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8469s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.63E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0011s + [COUNTERS] PROGRAM TOTAL : 0.8562s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8507s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.04E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0047s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cuda (0.20313504512110778) differ by less than 2E-4 (3.1376434783680907e-10) +OK! xsec from fortran (0.20313504505737126) and cuda (0.20313504511630270) differ by less than 2E-4 (2.9010971402954056e-10) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.929266e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.558045e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.319589e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.456934e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.340652e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.187313e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.169068e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.035767e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.326566e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.212826e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.337296e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.409792e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.337938e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.225960e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.656612e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.646014e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt index d2a669114e..2e04a004a3 100644 --- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx - -make USEBUILDDIR=1 BACKEND=cuda +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone + make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:54:40 +DATE: 2025-10-11_17:58:37 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 [UNWEIGHT] Wrote 3371 events (found 6399 events) - [COUNTERS] PROGRAM TOTAL : 0.9766s - [COUNTERS] Fortran Overhead ( 0 ) : 0.9291s - [COUNTERS] Fortran MEs ( 1 ) : 0.0475s for 8192 events => throughput is 1.72E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.0898s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0409s + [COUNTERS] Fortran MEs ( 1 ) : 0.0488s for 8192 events => throughput is 1.68E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4581s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4105s - [COUNTERS] Fortran MEs ( 1 ) : 0.0476s for 8192 events => throughput is 1.72E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4945s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4458s + [COUNTERS] Fortran MEs ( 1 ) : 0.0487s for 8192 events => throughput is 1.68E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,10 +116,10 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755170] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4592s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4086s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0501s for 8192 events => throughput is 1.63E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.5064s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4538s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0521s for 8192 events => throughput is 1.57E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081479755170) differ b OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.648377e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.624855e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.642355e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.621541e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,9 +161,9 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4344s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4065s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0275s for 8192 events => throughput is 2.98E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4797s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4512s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0281s for 8192 events => throughput is 2.91E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081479755183) differ b OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.984151e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.925389e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.017550e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.958081e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,9 +206,9 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755165] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4261s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4086s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0171s for 8192 events => throughput is 4.80E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4709s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4533s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0173s for 8192 events => throughput is 4.75E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081479755165) differ b OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.938014e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.831423e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.942444e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.833351e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,9 +251,9 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755165] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4299s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4143s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0153s for 8192 events => throughput is 5.37E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4705s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4537s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0165s for 8192 events => throughput is 4.97E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081479755165) differ b OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.398535e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.130791e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.466636e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.171570e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,10 +296,10 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755179] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4391s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4149s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0238s for 8192 events => throughput is 3.45E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.4789s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4536s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0248s for 8192 events => throughput is 3.30E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081479755179) differ b OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.480162e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.370093e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.526547e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.372925e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081479755192] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081479755196] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.8532s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8493s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.62E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0008s + [COUNTERS] PROGRAM TOTAL : 0.8974s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8926s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.14E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0041s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755183) and cuda (2.0160081479755192) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (2.0160081479755183) and cuda (2.0160081479755196) differ by less than 3E-14 (6.661338147750939e-16) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.920216e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.725729e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.457557e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.044433e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.816989e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.665417e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.149758e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.597159e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.802618e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.632530e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.511448e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.850879e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.832166e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.607978e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.514724e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.211181e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt index 483bc4166c..b05e5697ad 100644 --- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx -make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:55:09 +DATE: 2025-10-11_17:59:08 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 [UNWEIGHT] Wrote 3371 events (found 6399 events) - [COUNTERS] PROGRAM TOTAL : 0.9638s - [COUNTERS] Fortran Overhead ( 0 ) : 0.9156s - [COUNTERS] Fortran MEs ( 1 ) : 0.0482s for 8192 events => throughput is 1.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.0937s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0443s + [COUNTERS] Fortran MEs ( 1 ) : 0.0494s for 8192 events => throughput is 1.66E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4563s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4085s - [COUNTERS] Fortran MEs ( 1 ) : 0.0478s for 8192 events => throughput is 1.71E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4992s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4492s + [COUNTERS] Fortran MEs ( 1 ) : 0.0500s for 8192 events => throughput is 1.64E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -107,26 +114,27 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160406825242951] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160406822335140] fbridge_mode=1 [UNWEIGHT] Wrote 1653 events (found 1658 events) - [COUNTERS] PROGRAM TOTAL : 0.4552s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4076s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0473s for 8192 events => throughput is 1.73E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5029s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4535s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0491s for 8192 events => throughput is 1.67E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (2.0160081479755183) and cpp (2.0160406825242951) differ by less than 4E-4 (1.6138103811513815e-05) +OK! xsec from fortran (2.0160081479755183) and cpp (2.0160406822335140) differ by less than 4E-4 (1.613795957533526e-05) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** ERROR! events.lhe.cpp.1 and events.lhe.ref.1 differ! -diff /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/events.lhe.cpp.1 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/events.lhe.ref.1 | head -20 -7562,7575d7561 -< 4 1 1E-03 0.1250010E+03 0.7546771E-02 0.1235066E+00 +diff /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/events.lhe.cpp.1 /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/events.lhe.ref.1 | head -20 +8102,8116d8101 +< 5 1 1E-03 0.1250010E+03 0.7546771E-02 0.1235066E+00 < 21 -1 0 0 503 502 0.00000000000E+00 0.00000000000E+00 0.71320499473E+02 0.71320499473E+02 0.00000000000E+00 0. 1. < 21 -1 0 0 502 503 -0.00000000000E+00 -0.00000000000E+00 -0.54771239790E+02 0.54771239790E+02 0.00000000000E+00 0. 1. -< 5 1 1 2 501 0 0.50303102232E+02 0.36190119942E+02 0.14973002893E+02 0.63925016162E+02 0.47000000000E+01 0. -1. -< -5 1 1 2 0 501 -0.50303102232E+02 -0.36190119942E+02 0.15762567893E+01 0.62166723101E+02 0.47000000000E+01 0. -1. +< 25 2 1 2 0 0 0.00000000000E+00 0.00000000000E+00 0.16549259682E+02 0.12609173926E+03 0.12500099485E+03 0. 0. +< 5 1 3 3 501 0 0.50303102232E+02 0.36190119942E+02 0.14973002893E+02 0.63925016162E+02 0.47000000000E+01 0. -1. +< -5 1 3 3 0 501 -0.50303102232E+02 -0.36190119942E+02 0.15762567893E+01 0.62166723101E+02 0.47000000000E+01 0. -1. 
< < 0 0.12500099E+03 < 0 diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt index b61563e796..a81624efdc 100644 --- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx - +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:54:54 +DATE: 2025-10-11_17:58:52 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 [UNWEIGHT] Wrote 3371 events (found 6399 events) - [COUNTERS] PROGRAM TOTAL : 0.9594s - [COUNTERS] Fortran Overhead ( 0 ) : 0.9118s - [COUNTERS] Fortran MEs ( 1 ) : 0.0475s for 8192 events => throughput is 1.72E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.0919s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0436s + [COUNTERS] Fortran MEs ( 1 ) : 0.0483s for 8192 events => throughput is 1.70E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4589s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4111s - [COUNTERS] Fortran MEs ( 1 ) : 0.0478s for 8192 events => throughput is 1.71E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4974s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4479s + [COUNTERS] Fortran MEs ( 1 ) : 0.0494s for 8192 events => throughput is 1.66E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -100,7 +107,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 @@ -108,33 +114,30 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081964453331] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081963935692] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4600s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4089s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0507s for 8192 events => throughput is 1.62E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.5020s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4502s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0513s for 8192 events => throughput is 1.60E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081964453331) differ by less than 2E-4 (2.4042469792817656e-08) +OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081963935692) differ by less than 2E-4 (2.401679322083794e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.539881e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.533252e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.532971e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.529423e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -149,7 +152,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 @@ -157,33 +159,30 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081964453336] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081964477738] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4363s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4080s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0279s for 8192 events => throughput is 2.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4812s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4523s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0285s for 8192 events => throughput is 2.88E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081964453336) differ by less than 2E-4 (2.404247001486226e-08) +OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081964477738) differ by less than 2E-4 (2.4043680380003707e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.824636e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.789074e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.869373e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.799101e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -198,7 +197,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 @@ -206,33 +204,30 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081962974745] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081981450446] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4311s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4138s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0169s for 8192 events => throughput is 4.85E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4709s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4532s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0173s for 8192 events => throughput is 4.73E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081962974745) differ by less than 2E-4 (2.3969127349587893e-08) +OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081981450446) differ by less than 2E-4 (2.4885577154520888e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.809707e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.670071e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.724204e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.743283e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -247,7 +242,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 @@ -255,33 +249,30 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081962974745] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081981450446] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4252s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4093s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0155s for 8192 events => throughput is 5.28E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4728s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4554s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0171s for 8192 events => throughput is 4.80E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081962974745) differ by less than 2E-4 (2.3969127349587893e-08) +OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081981450446) differ by less than 2E-4 (2.4885577154520888e-08) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.163712e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.832111e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.204514e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.036692e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,7 +287,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 @@ -304,33 +294,30 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081962970020] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081981445623] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4306s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4058s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0244s for 8192 events => throughput is 3.36E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4774s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4523s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0246s for 8192 events => throughput is 3.32E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081962970020) differ by less than 2E-4 (2.3968893092529697e-08) +OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081981445623) differ by less than 2E-4 (2.4885338012481384e-08) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.121651e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.244912e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.119023e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.260859e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -352,60 +339,60 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081483021330] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081952642219] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.8574s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8536s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.63E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0008s + [COUNTERS] PROGRAM TOTAL : 0.9023s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8974s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.15E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0042s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to 
MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755183) and cuda (2.0160081483021330) differ by less than 2E-4 (1.6201062713605552e-10) +OK! xsec from fortran (2.0160081479755183) and cuda (2.0160081952642219) differ by less than 2E-4 (2.345660332636612e-08) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.018963e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.648200e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.363694e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.088314e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.820757e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.635192e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.067644e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.596149e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.797704e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.579204e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.465309e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.870733e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.821262e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.605252e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] 
[inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.503862e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.211048e+07 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt index d3cb91b8cd..ee647bf095 100644 --- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:56:37 +DATE: 2025-10-11_18:00:38 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 902 events) - [COUNTERS] PROGRAM TOTAL : 2.6766s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3643s - [COUNTERS] Fortran MEs ( 1 ) : 2.3123s for 8192 events => throughput is 3.54E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7275s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3706s + [COUNTERS] Fortran MEs ( 1 ) : 2.3569s for 8192 events => throughput is 3.48E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.6640s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3633s - [COUNTERS] Fortran MEs ( 1 ) : 2.3007s for 8192 events => throughput is 3.56E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7259s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3684s + [COUNTERS] Fortran MEs ( 1 ) : 2.3575s for 8192 events => throughput is 3.47E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=1 [UNWEIGHT] 
Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.8505s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3633s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.4822s for 8192 events => throughput is 3.30E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0050s + [COUNTERS] PROGRAM TOTAL : 2.8149s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3695s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.4402s for 8192 events => throughput is 3.36E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0051s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381610362728588E-007) differ by less than 3E-14 (0.0) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381610362728588E-007) differ by less than 3E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.457369e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.441343e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.441555e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.445366e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,28 +161,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728610E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 1.6655s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3645s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2984s for 8192 events => throughput is 6.31E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0027s + [COUNTERS] PROGRAM TOTAL : 1.7137s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3713s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.3396s for 8192 events => throughput is 6.12E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0028s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381610362728610E-007) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381610362728610E-007) differ by less than 3E-14 (4.440892098500626e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.514132e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.351156e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.544925e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.406951e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,28 +206,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.9435s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3668s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5751s for 8192 events => throughput is 1.42E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0015s + [COUNTERS] PROGRAM TOTAL : 0.9625s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3707s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5902s for 8192 events => throughput is 1.39E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0016s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381610362728588E-007) differ by less than 3E-14 (0.0) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381610362728588E-007) differ by less than 3E-14 (2.220446049250313e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.460459e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.435538e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.466853e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.436593e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,28 +251,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.8804s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3647s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5144s for 8192 events => throughput is 1.59E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.9044s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3692s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5338s for 8192 events => throughput is 1.53E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0014s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381610362728588E-007) differ by less than 3E-14 (0.0) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381610362728588E-007) differ by less than 3E-14 (2.220446049250313e-16) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.641494e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.541883e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.655223e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.588675e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,28 +296,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 1.0440s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3665s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6757s for 8192 events => throughput is 1.21E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0018s + [COUNTERS] PROGRAM TOTAL : 1.0751s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3693s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7040s for 8192 events => throughput is 1.16E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0019s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381610362728588E-007) differ by less than 3E-14 (0.0) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381610362728588E-007) differ by less than 3E-14 (2.220446049250313e-16) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.221115e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.193272e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.225553e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.191231e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -334,58 +341,58 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.8457s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8061s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0199s for 8192 events => throughput is 4.13E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0198s + [COUNTERS] PROGRAM TOTAL : 0.8448s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8136s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0125s for 8192 events => throughput is 6.56E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0187s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cuda (7.6381610362728578E-007) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (7.6381610362728578E-007) and cuda (7.6381610362728578E-007) differ by less than 3E-14 (0.0) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.230611e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.695448e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.541816e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.925847e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.854537e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.997799e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.229320e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.170285e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.859903e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.983419e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.225591e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.128334e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.850975e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.982511e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA 
[nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.687847e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.328429e+05 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt index 10c15cf9d1..1cc58a2dd1 100644 --- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx + make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:58:07 +DATE: 2025-10-11_18:02:03 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 902 events) - [COUNTERS] PROGRAM TOTAL : 2.6755s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3603s - [COUNTERS] Fortran MEs ( 1 ) : 2.3152s for 8192 events => throughput is 3.54E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7018s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3625s + [COUNTERS] Fortran MEs ( 1 ) : 2.3393s for 8192 events => throughput is 3.50E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.6754s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3698s - [COUNTERS] Fortran MEs ( 1 ) : 2.3056s for 8192 events => throughput is 3.55E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7141s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3681s + [COUNTERS] Fortran MEs ( 1 ) : 2.3460s for 8192 events => throughput is 3.49E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -107,30 +114,30 @@ DEBUG: MEK processed 8192 events across 72 channels { 
1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381686438954397E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381686359952968E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.8067s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3659s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.4358s for 8192 events => throughput is 3.36E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0050s + [COUNTERS] PROGRAM TOTAL : 2.7333s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3691s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.3595s for 8192 events => throughput is 3.47E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0047s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381686438954397E-007) differ by less than 4E-4 (9.960018576560259e-07) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381686359952968E-007) differ by less than 4E-4 (9.949675585652074e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.485505e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.581994e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.473644e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.595398e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,28 +161,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381671483253128E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 1.0546s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3688s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6842s for 8192 events => throughput is 1.20E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.0796s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3702s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7079s for 8192 events => throughput is 1.16E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0015s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381671483253128E-007) differ by less than 4E-4 (8.001994753481512e-07) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381671483253128E-007) differ by less than 4E-4 (8.001994755701958e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.232148e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.209114e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.242719e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.211724e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,28 +206,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381672175647812E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.6626s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3670s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2947s for 8192 events => throughput is 2.78E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0009s + [COUNTERS] PROGRAM TOTAL : 0.6741s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3720s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3011s for 8192 events => throughput is 2.72E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0010s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381672175647812E-007) differ by less than 4E-4 (8.092644150359263e-07) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381672175647812E-007) differ by less than 4E-4 (8.092644150359263e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.866680e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.778595e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.814611e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.785996e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,28 +251,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381672175647812E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.6345s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3672s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2664s for 8192 events => throughput is 3.07E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6455s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3705s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2742s for 8192 events => throughput is 2.99E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0008s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381672175647812E-007) differ by less than 4E-4 (8.092644150359263e-07) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381672175647812E-007) differ by less than 4E-4 (8.092644150359263e-07) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.183014e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.038472e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.199503e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.060001e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,28 +296,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381686320975603E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.7045s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3656s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3379s for 8192 events => throughput is 2.42E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7218s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3694s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3514s for 8192 events => throughput is 2.33E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0010s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381686320975603E-007) differ by less than 4E-4 (9.944572607611946e-07) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381686320975603E-007) differ by less than 4E-4 (9.944572609832392e-07) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.460974e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.367267e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.436294e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.356404e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381711031958629E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381615491789429E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.8419s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8049s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0197s for 8192 events => throughput is 4.15E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0172s + [COUNTERS] PROGRAM TOTAL : 0.8351s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8093s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 8192 events => throughput is 1.08E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0182s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cuda (7.6381711031958629E-007) differ by less than 4E-4 (1.3179773188376487e-06) +OK! xsec from fortran (7.6381610362728578E-007) and cuda (7.6381615491789429E-007) differ by less than 4E-4 (6.715046763083876e-08) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.233915e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.138586e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.454452e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.179241e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.300238e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.224464e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.323216e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.249728e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.294935e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.225890e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.322990e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.250555e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.292471e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.220840e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA 
[nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.654983e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.651149e+05 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt index 9cff3d3d2c..2ca786964c 100644 --- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx - +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone + +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:57:22 +DATE: 2025-10-11_18:01:20 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 902 events) - [COUNTERS] PROGRAM TOTAL : 2.6661s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3588s - [COUNTERS] Fortran MEs ( 1 ) : 2.3072s for 8192 events => throughput is 3.55E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7267s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3648s + [COUNTERS] Fortran MEs ( 1 ) : 2.3619s for 8192 events => throughput is 3.47E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.6664s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3632s - [COUNTERS] Fortran MEs ( 1 ) : 2.3031s for 8192 events => throughput is 3.56E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7387s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3721s + [COUNTERS] Fortran MEs ( 1 ) : 2.3666s for 8192 events => throughput is 3.46E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381608764955655E-007] fbridge_mode=1 [UNWEIGHT] 
Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.8757s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3651s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.5054s for 8192 events => throughput is 3.27E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.8711s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3762s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.4897s for 8192 events => throughput is 3.29E+03 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0052s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381608764955655E-007) differ by less than 2E-4 (2.0918293319738268e-08) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608764955655E-007) differ by less than 2E-4 (2.0918293208715966e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.427512e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.387716e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.426484e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.386658e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,28 +161,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381608686521600E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 1.6394s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3662s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2706s for 8192 events => throughput is 6.45E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0027s + [COUNTERS] PROGRAM TOTAL : 1.6908s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3716s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.3164s for 8192 events => throughput is 6.22E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0028s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381608686521600E-007) differ by less than 2E-4 (2.1945164241365944e-08) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608686521600E-007) differ by less than 2E-4 (2.1945164130343642e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.733385e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.591306e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.780255e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.584653e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,28 +206,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381608826200266E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.9411s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3649s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5747s for 8192 events => throughput is 1.43E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0015s + [COUNTERS] PROGRAM TOTAL : 0.9663s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3722s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5924s for 8192 events => throughput is 1.38E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0016s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469379161117e-08) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469157116512e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.446717e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.420848e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.473262e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.429579e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,28 +251,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381608826200266E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.8685s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3656s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5015s for 8192 events => throughput is 1.63E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.9022s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3723s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5284s for 8192 events => throughput is 1.55E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0014s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469379161117e-08) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469157116512e-08) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.681650e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.602337e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.668117e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.607376e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,28 +296,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381608826200266E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 1.0574s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3699s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6857s for 8192 events => throughput is 1.19E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0018s + [COUNTERS] PROGRAM TOTAL : 1.0826s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3723s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7085s for 8192 events => throughput is 1.16E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0019s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469379161117e-08) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469157116512e-08) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.232369e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.176853e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.216790e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.176159e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610372590318E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381608867927968E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.8397s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8000s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0198s for 8192 events => throughput is 4.13E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0198s + [COUNTERS] PROGRAM TOTAL : 0.8465s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8152s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0125s for 8192 events => throughput is 6.53E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0188s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cuda (7.6381610372590318E-007) differ by less than 2E-4 (1.2911138824733825e-10) +OK! xsec from fortran (7.6381610362728578E-007) and cuda (7.6381608867927968E-007) differ by less than 2E-4 (1.9570163600768353e-08) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.219575e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.668728e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.527801e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.889186e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.836972e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.020522e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.176072e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.111985e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.835271e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.014502e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.206917e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.139379e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.823749e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.980651e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA 
[nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.671807e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.329147e+05 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt index f18eaf3551..869ed226f5 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:55:56 +DATE: 2025-10-11_17:59:56 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1732 events (found 4297 events) - [COUNTERS] PROGRAM TOTAL : 0.6925s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6838s - [COUNTERS] Fortran MEs ( 1 ) : 0.0087s for 8192 events => throughput is 9.43E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7024s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6938s + [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.48E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4263s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4177s - [COUNTERS] Fortran MEs ( 1 ) : 0.0085s for 8192 events => throughput is 9.62E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4256s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4169s + [COUNTERS] Fortran MEs ( 1 ) : 0.0087s for 8192 events => throughput is 9.46E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,9 +116,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4276s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4188s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0084s for 8192 events => throughput is 9.73E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4378s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4280s + [COUNTERS] CudaCpp MEs ( 2 ) : 
0.0094s for 8192 events => throughput is 8.69E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449452343426120) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.916439e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.191014e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.017065e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.282907e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,9 +161,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4290s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4241s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0046s for 8192 events => throughput is 1.79E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4316s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4266s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0047s for 8192 events => throughput is 1.75E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449452343426120) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.913729e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.860989e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.928329e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.909431e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,10 +206,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426114] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4231s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4198s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.80E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.4296s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4263s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.79E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449452343426114) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.118646e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.006727e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.327279e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.109595e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,10 +251,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426114] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4242s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4212s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 3.07E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4313s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4281s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.87E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449452343426114) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.142389e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.041656e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.418661e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.245400e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,9 +296,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426114] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4276s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4241s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.67E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4344s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4307s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.48E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449452343426114) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.810680e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.847128e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.123505e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.978037e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449452343426109] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449452343426103] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.8704s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8668s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.69E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.8657s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8616s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.19E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0035s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (0.30449452343426120) and cuda (0.30449452343426109) differ by less than 3E-14 (3.3306690738754696e-16) +OK! xsec from fortran (0.30449452343426120) and cuda (0.30449452343426103) differ by less than 3E-14 (5.551115123125783e-16) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.094441e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.369013e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.576690e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.148244e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.540792e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.850459e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.885377e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.711716e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.486109e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.810975e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.914518e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.845473e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.512059e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.786901e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] 
[inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.224875e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.505596e+08 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt index 9cee2ab297..290a3c86d1 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:56:23 +DATE: 2025-10-11_18:00:24 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1732 events (found 4297 events) - [COUNTERS] PROGRAM TOTAL : 0.6965s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6879s - [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6996s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6911s + [COUNTERS] Fortran MEs ( 1 ) : 0.0085s for 8192 events => throughput is 9.67E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4263s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4177s - [COUNTERS] Fortran MEs ( 1 ) : 0.0087s for 8192 events => throughput is 9.43E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4259s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4174s + [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.55E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,9 +116,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449446496609361] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4268s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4183s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0082s for 8192 events => throughput is 9.95E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4354s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4265s + [COUNTERS] CudaCpp MEs ( 2 ) : 
0.0086s for 8192 events => throughput is 9.52E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449446496609361) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.006620e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.988834e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.012762e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.001217e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,9 +161,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449446369440458] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4190s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4159s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.94E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4277s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4247s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.97E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449446369440458) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.282555e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.265266e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.369793e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.237148e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,9 +206,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449446614968528] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4206s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4183s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0020s for 8192 events => throughput is 4.08E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4268s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4247s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0019s for 8192 events => throughput is 4.33E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449446614968528) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.872977e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.015677e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.148892e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.231737e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,9 +251,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449446614968528] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4201s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4180s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0018s for 8192 events => throughput is 4.44E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4273s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4252s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0019s for 8192 events => throughput is 4.39E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449446614968528) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.886846e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.231045e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.506416e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.443837e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,9 +296,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449447031649013] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4202s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4176s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.54E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4294s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4268s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.60E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449447031649013) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.376595e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.280248e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.863933e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.772169e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449447352014630] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449447192383194] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.8576s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8540s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.64E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.8794s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8751s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.15E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0036s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (0.30449452343426120) and cuda (0.30449447352014630) differ by less than 4E-4 (1.639245078566276e-07) +OK! xsec from fortran (0.30449452343426120) and cuda (0.30449447192383194) differ by less than 4E-4 (1.6916701384150912e-07) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.209039e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.023525e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.497762e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.499953e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.599688e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.571654e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.103544e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.545216e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.606706e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.440681e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.131283e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.320302e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.229812e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.015605e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] 
[inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.664371e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.300602e+08 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt index 782fee34a5..54eb3e1a6f 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x - +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone + make USEBUILDDIR=1 BACKEND=cppsse4 -make USEBUILDDIR=1 BACKEND=cppavx2 +make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:56:09 +DATE: 2025-10-11_18:00:10 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1732 events (found 4297 events) - [COUNTERS] PROGRAM TOTAL : 0.6953s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6866s - [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6912s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6825s + [COUNTERS] Fortran MEs ( 1 ) : 0.0088s for 8192 events => throughput is 9.35E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4236s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4152s - [COUNTERS] Fortran MEs ( 1 ) : 0.0085s for 8192 events => throughput is 9.68E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4267s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4180s + [COUNTERS] Fortran MEs ( 1 ) : 0.0087s for 8192 events => throughput is 9.44E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,10 +116,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449453160892032] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4277s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4187s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0086s for 8192 events => throughput is 
9.54E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4348s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4250s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 8192 events => throughput is 8.68E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453160892032) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.831908e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.020488e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.918457e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.158136e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,9 +161,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449453160892032] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4287s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4239s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0045s for 8192 events => throughput is 1.83E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4307s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4256s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0047s for 8192 events => throughput is 1.75E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453160892032) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.892977e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.944164e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.974211e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.990329e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,10 +206,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449453255288433] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4216s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4183s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.84E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.4315s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4283s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.89E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453255288433) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.237521e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.282930e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.477152e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.189855e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,9 +251,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449453255288433] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4270s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4239s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 3.00E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4314s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4283s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 3.02E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453255288433) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.311234e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.114512e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.507028e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.432567e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,9 +296,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449453255288433] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4247s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4212s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.59E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4300s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4264s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.53E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453255288433) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.926715e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.966860e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.198931e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.100849e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449452360186230] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449453231638185] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.8627s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8591s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.69E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.8660s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8619s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.21E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0035s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (0.30449452343426120) and cuda (0.30449452360186230) differ by less than 2E-4 (5.504239286580059e-10) +OK! xsec from fortran (0.30449452343426120) and cuda (0.30449453231638185) differ by less than 2E-4 (2.917005059721589e-08) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.206349e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.132456e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.536038e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.476431e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.506637e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.825751e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.900315e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.688447e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.486873e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.845505e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.921916e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.878507e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.466467e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.760833e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] 
[inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.235205e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.514420e+08 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt index bebebe43ae..79dba98821 100644 --- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone - +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:55:13 +DATE: 2025-10-11_17:59:12 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 2625 events (found 5368 events) - [COUNTERS] PROGRAM TOTAL : 0.8496s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8074s - [COUNTERS] Fortran MEs ( 1 ) : 0.0422s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8640s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8203s + [COUNTERS] Fortran MEs ( 1 ) : 0.0438s for 8192 events => throughput is 1.87E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4529s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4105s - [COUNTERS] Fortran MEs ( 1 ) : 0.0424s for 8192 events => throughput is 1.93E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4586s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4147s + [COUNTERS] Fortran MEs ( 1 ) : 0.0440s for 8192 events => throughput is 1.86E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846964] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4621s - [COUNTERS] Fortran Overhead 
( 0 ) : 0.4167s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0450s for 8192 events => throughput is 1.82E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4711s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4252s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0455s for 8192 events => throughput is 1.80E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641911695846964) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (44.641911695846943) and cpp (44.641911695846964) differ by less than 3E-14 (4.440892098500626e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.859940e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.837387e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.839978e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.822913e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,28 +161,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4403s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4153s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0246s for 8192 events => throughput is 3.33E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4480s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4218s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0258s for 8192 events => throughput is 3.17E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641911695846957) differ by less than 3E-14 (0.0) +OK! xsec from fortran (44.641911695846943) and cpp (44.641911695846957) differ by less than 3E-14 (2.220446049250313e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.243144e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.267707e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.273347e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.222778e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,28 +206,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4324s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4161s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0159s for 8192 events => throughput is 5.14E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4349s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4186s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0159s for 8192 events => throughput is 5.17E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641911695846950) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (44.641911695846943) and cpp (44.641911695846950) differ by less than 3E-14 (2.220446049250313e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.210364e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.198106e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.310117e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.028037e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,28 +251,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4350s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4201s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0146s for 8192 events => throughput is 5.62E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4391s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4230s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0156s for 8192 events => throughput is 5.24E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641911695846950) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (44.641911695846943) and cpp (44.641911695846950) differ by less than 3E-14 (2.220446049250313e-16) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.704117e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.463972e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.793092e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.474487e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,28 +296,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4434s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4195s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0234s for 8192 events => throughput is 3.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4521s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4278s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0239s for 8192 events => throughput is 3.42E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641911695846950) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (44.641911695846943) and cpp (44.641911695846950) differ by less than 3E-14 (2.220446049250313e-16) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.552376e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.505694e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.639783e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.538808e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -334,58 +341,58 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.8650s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8612s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.66E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0008s + [COUNTERS] PROGRAM TOTAL : 0.8667s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8617s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.15E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0042s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cuda (44.641911695846950) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (44.641911695846943) and cuda (44.641911695846950) differ by less than 3E-14 (2.220446049250313e-16) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.043338e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.923790e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.325784e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.174225e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.871559e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.777101e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.143094e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.655868e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.865333e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.765814e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.020534e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.993174e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.868423e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.751468e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.708181e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.413877e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt index 2a76a737ac..5dfa48ff39 100644 --- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx + make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone - make USEBUILDDIR=1 BACKEND=cppsse4 -make USEBUILDDIR=1 BACKEND=cppavx2 +make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:55:42 +DATE: 2025-10-11_17:59:42 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 2625 events (found 5368 events) - [COUNTERS] PROGRAM TOTAL : 0.8397s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7971s - [COUNTERS] Fortran MEs ( 1 ) : 0.0425s for 8192 events => throughput is 1.93E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8523s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8088s + [COUNTERS] Fortran MEs ( 1 ) : 0.0435s for 8192 events => throughput is 1.88E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4553s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4122s - [COUNTERS] Fortran MEs ( 1 ) : 0.0430s for 8192 events => throughput is 1.90E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4551s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4119s + [COUNTERS] Fortran MEs ( 1 ) : 0.0433s for 8192 events => throughput is 1.89E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641906072918047] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4624s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4193s + [COUNTERS] PROGRAM TOTAL : 0.4653s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4221s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0429s for 8192 events => throughput is 1.91E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to 
MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641906072918047) differ by less than 4E-4 (1.2595627507661078e-07) +OK! xsec from fortran (44.641911695846943) and cpp (44.641906072918047) differ by less than 4E-4 (1.2595627474354387e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.972969e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.918004e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.987350e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.936998e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,28 +161,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641902189470080] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4356s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4182s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0171s for 8192 events => throughput is 4.78E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4377s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4199s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0176s for 8192 events => throughput is 4.66E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641902189470080) differ by less than 4E-4 (2.1294735186305758e-07) +OK! xsec from fortran (44.641911695846943) and cpp (44.641902189470080) differ by less than 4E-4 (2.1294735152999067e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.748983e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.699516e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.695429e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.722220e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,26 +208,26 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [UNWEIGHT] Wrote 1617 events (found 1622 events) [COUNTERS] PROGRAM TOTAL : 0.4310s [COUNTERS] Fortran Overhead ( 0 ) : 0.4214s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0093s for 8192 events => throughput is 8.83E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 8192 events => throughput is 8.72E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641902360436738) differ by less than 4E-4 (2.0911761755559866e-07) +OK! xsec from fortran (44.641911695846943) and cpp (44.641902360436738) differ by less than 4E-4 (2.0911761733355405e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.169652e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.856695e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.239468e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.157334e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,28 +251,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641902360436738] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4221s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4132s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0086s for 8192 events => throughput is 9.50E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4281s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4187s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0091s for 8192 events => throughput is 8.96E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (44.641911695846957) and cpp (44.641902360436738) differ by less than 4E-4 (2.0911761755559866e-07) +OK! xsec from fortran (44.641911695846943) and cpp (44.641902360436738) differ by less than 4E-4 (2.0911761733355405e-07) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.627165e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.452792e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.935546e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.496015e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,28 +296,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641906399820272] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4293s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4169s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0121s for 8192 events => throughput is 6.75E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4332s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4204s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0126s for 8192 events => throughput is 6.52E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641906399820272) differ by less than 4E-4 (1.1863351012664225e-07) +OK! xsec from fortran (44.641911695846943) and cpp (44.641906399820272) differ by less than 4E-4 (1.1863350990459764e-07) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.774461e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.751797e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.994273e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.843654e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641910992291372] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641911000118164] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.8577s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8540s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.69E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s + [COUNTERS] PROGRAM TOTAL : 0.8690s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8644s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.06E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0039s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cuda (44.641910992291372) differ by less than 4E-4 (1.575997887748315e-08) +OK! xsec from fortran (44.641911695846943) and cuda (44.641911000118164) differ by less than 4E-4 (1.5584654677880394e-08) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.201092e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.158414e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.452650e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.781779e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.883185e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.387147e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.341479e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.660863e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.843740e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.340902e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.360831e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.882663e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.608054e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.999883e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.014740e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.181537e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt index 449e459bdc..4c27cac81e 100644 --- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cppnone -make USEBUILDDIR=1 BACKEND=cppnone -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:55:27 +DATE: 2025-10-11_17:59:27 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 2625 events (found 5368 events) - [COUNTERS] PROGRAM TOTAL : 0.8469s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8046s - [COUNTERS] Fortran MEs ( 1 ) : 0.0423s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8565s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8130s + [COUNTERS] Fortran MEs ( 1 ) : 0.0434s for 8192 events => throughput is 1.89E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4519s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4098s - [COUNTERS] Fortran MEs ( 1 ) : 0.0422s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4587s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4147s + [COUNTERS] Fortran MEs ( 1 ) : 0.0440s for 8192 events => throughput is 1.86E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641912938404218] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4618s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4159s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0455s for 8192 events => throughput is 1.80E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4690s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4218s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0468s for 8192 events => throughput is 1.75E+05 events/s 
[COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641912938404218) differ by less than 2E-4 (2.783387209603916e-08) +OK! xsec from fortran (44.641911695846943) and cpp (44.641912938404218) differ by less than 2E-4 (2.7833872318083763e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.833802e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.793421e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.834236e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.799600e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,28 +161,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641912938404218] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4439s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4188s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0248s for 8192 events => throughput is 3.30E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4483s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4223s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0256s for 8192 events => throughput is 3.20E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641912938404218) differ by less than 2E-4 (2.783387209603916e-08) +OK! xsec from fortran (44.641911695846943) and cpp (44.641912938404218) differ by less than 2E-4 (2.7833872318083763e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.367073e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.273502e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.340820e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.281864e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,28 +206,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641912970378179] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4302s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4144s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0154s for 8192 events => throughput is 5.33E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4382s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4219s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0159s for 8192 events => throughput is 5.17E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641912970378179) differ by less than 2E-4 (2.8550104058666648e-08) +OK! xsec from fortran (44.641911695846943) and cpp (44.641912970378179) differ by less than 2E-4 (2.8550104280711253e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.283261e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.329657e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.353744e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.307405e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,28 +251,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641912970378179] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4297s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4151s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0143s for 8192 events => throughput is 5.75E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4397s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4242s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0151s for 8192 events => throughput is 5.42E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641912970378179) differ by less than 2E-4 (2.8550104058666648e-08) +OK! xsec from fortran (44.641911695846943) and cpp (44.641912970378179) differ by less than 2E-4 (2.8550104280711253e-08) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.825518e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.584798e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.928231e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.705746e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,28 +296,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641912970378179] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4383s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4159s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0220s for 8192 events => throughput is 3.73E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4435s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4205s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0227s for 8192 events => throughput is 3.61E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641912970378179) differ by less than 2E-4 (2.8550104058666648e-08) +OK! xsec from fortran (44.641911695846943) and cpp (44.641912970378179) differ by less than 2E-4 (2.8550104280711253e-08) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.615578e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.605692e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.732261e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.652839e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911674225568] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641912949951454] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.8598s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8560s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.69E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s + [COUNTERS] PROGRAM TOTAL : 0.8669s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8620s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.17E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0042s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cuda (44.641911674225568) differ by less than 2E-4 (4.843293543999039e-10) +OK! xsec from fortran (44.641911695846943) and cuda (44.641912949951454) differ by less than 2E-4 (2.809253607516382e-08) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.907482e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.727760e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.361691e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.049471e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.875077e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.736425e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.567905e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.634947e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.865156e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.745425e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.911973e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.997146e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.881287e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.718374e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.733673e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.415073e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/madX.sh b/epochX/cudacpp/tmad/madX.sh index 56fbce5d92..3c16230360 100755 --- a/epochX/cudacpp/tmad/madX.sh +++ b/epochX/cudacpp/tmad/madX.sh @@ -253,7 +253,7 @@ function getgridmax() elif [ "${ggttg}" == "1" ]; then echo 16384 32 # same total grid dimension as 2048 256 elif [ "${ggttgg}" == "1" ]; then - echo 16384 32 # same total grid dimension as 2048 256 + echo 512 32 # same total grid dimension as 64 256 (new sep2025: even 1024/32 aborts in max8thr mode) elif [ "${ggttggg}" == "1" ]; then echo 512 32 # same total grid dimension as 64 256 elif [ "${gguu}" == "1" ]; then @@ -478,9 +478,15 @@ function runmadevent() # PART 1 - build madevent ########################################################################## +echo MADGRAPH_CUDA_ARCHITECTURE=${MADGRAPH_CUDA_ARCHITECTURE} +echo MADGRAPH_HIP_ARCHITECTURE=${MADGRAPH_HIP_ARCHITECTURE} + unset GTEST_ROOT unset LOCALGTEST +export HASBLAS=hasBlas +echo HASBLAS=${HASBLAS} + for suff in $suffs; do dir=$(showdir) @@ -511,6 +517,12 @@ if [ "${maketype}" == "-makeonly" ]; then printf "\nMAKE COMPLETED\n"; exit 0; f # PART 2 - run madevent ########################################################################## +unset CUDACPP_RUNTIME_BLASCOLORSUM +printf "\nCUDACPP_RUNTIME_BLASCOLORSUM=$CUDACPP_RUNTIME_BLASCOLORSUM\n" + +unset CUDACPP_RUNTIME_CUBLASTF32TENSOR +printf "\nCUDACPP_RUNTIME_CUBLASTF32TENSOR=$CUDACPP_RUNTIME_CUBLASTF32TENSOR\n" + printf "\nOMP_NUM_THREADS=$OMP_NUM_THREADS\n" printf "\nDATE: $(date '+%Y-%m-%d_%H:%M:%S')\n\n" diff --git a/epochX/cudacpp/tmad/strip10x.sh b/epochX/cudacpp/tmad/strip10x.sh new file mode 100755 index 0000000000..571d134a64 --- /dev/null +++ b/epochX/cudacpp/tmad/strip10x.sh @@ -0,0 +1,11 @@ +#!/bin/sh +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +# Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +cd $(dirname $0) +for log in logs_*/log*.txt ; do + cat $log | awk 'BEGIN{ok=1}; /^\*\*\*/{if ($5=="x10") ok=0; else ok=1}; {if (ok==1) print $0}' > ${log}.new + mv ${log}.new ${log} +done diff --git a/epochX/cudacpp/tput/allTees.sh b/epochX/cudacpp/tput/allTees.sh index 69ef153764..8475b8fd1b 100755 --- a/epochX/cudacpp/tput/allTees.sh +++ b/epochX/cudacpp/tput/allTees.sh @@ -1,8 +1,8 @@ #!/bin/bash -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: A. Valassi (Apr 2022) for the MG5aMC CUDACPP plugin. -# Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. 
scrdir=$(cd $(dirname $0); pwd) @@ -20,7 +20,7 @@ if [ "$(hostname)" == "itgold91.cern.ch" ]; then bblds=-cpponly; fi # Usage function usage() { - echo "Usage (1): $0 [-short] [-e] [-sa] [-makeonly] [-nomakeclean] [-hip|-nocuda|-cpponly] [-bsmonly|-nobsm]" + echo "Usage (1): $0 [-short] [-e] [-sa] [-makeonly] [-nomakeclean] [-hip|-nocuda|-cpponly] [-bsmonly|-nobsm|-scalingonly|-blasonly|-blasandscalingonly]" echo "Run tests and check all logs" echo "" echo "Usage (2): $0 -checkonly" @@ -32,7 +32,10 @@ function usage() checkonly=0 ggttggg=-ggttggg rndhst=-curhst -bsm= +sm=1 +bsm=1 +scaling=1 +blas=1 if [ "$1" == "-checkonly" ]; then # Check existing logs without running any tests? checkonly=1 @@ -73,11 +76,35 @@ while [ "${checkonly}" == "0" ] && [ "$1" != "" ]; do if [ "${bblds}" != "" ] && [ "${bblds}" != "$1" ]; then echo "ERROR! Incompatible option $1: backend builds are already defined as '$bblds'"; usage; fi bblds="$1" shift - elif [ "$1" == "-bsmonly" ] && [ "$bsm" != "-nobsm" ]; then - bsm=$1 + elif [ "$1" == "-bsmonly" ] && [ "${sm}${scaling}${bsm}${blas}" == "1111" ]; then + sm=0 + bsm=1 + scaling=0 + blas=0 shift - elif [ "$1" == "-nobsm" ] && [ "$bsm" != "-bsmonly" ]; then - bsm=$1 + elif [ "$1" == "-nobsm" ] && [ "${sm}${scaling}${bsm}${blas}" == "1111" ]; then + sm=1 + bsm=0 + scaling=1 + blas=1 + shift + elif [ "$1" == "-scalingonly" ] && [ "${sm}${scaling}${bsm}${blas}" == "1111" ]; then + sm=0 + bsm=0 + scaling=1 + blas=0 + shift + elif [ "$1" == "-blasonly" ] && [ "${blas}${scaling}${bsm}${blas}" == "1111" ]; then + sm=0 + bsm=0 + scaling=0 + blas=1 + shift + elif [ "$1" == "-blasandscalingonly" ] && [ "${blas}${scaling}${bsm}${blas}" == "1111" ]; then + sm=0 + bsm=0 + scaling=1 + blas=1 shift else usage @@ -88,11 +115,28 @@ done function checklogs() { cd $scrdir/.. - # Print out any errors in the logs - if ! egrep -i '(error|fault|failed)' ./tput/logs_* -r; then echo "No errors found in logs"; fi + # Print out any errors in the logs (exclude scaling logs) + if ! egrep -i '(error|fault|failed)' ./tput/logs_*/*.txt; then echo "No errors found in logs"; fi # Print out any FPEs or '{ }' in the logs echo if ! egrep '(^Floating Point Exception|{ })' tput/logs* -r; then echo "No FPEs or '{ }' found in logs"; fi + # Print out any aborts in the logs (exclude scaling logs) + echo + txt=$(grep Abort ./tput/logs_*/*.txt | sed "s|\:.*SubProcesses/P|: P|") + if [ "${txt}" == "" ]; then + echo "No aborts found in logs" + else + echo "${txt}" + fi + # Print out any asserts/aborts in scaling logs + echo + txt=$(egrep -i '(abort|assert)' ./tput/logs_*/*.scaling | sed "s|\:.*SubProcesses/P|: P|" | sort -u) + if [ "${txt}" == "" ]; then + echo "No aborts or asserts found in scaling logs" + else + echo "${txt}" + fi + # Print out the MEK channelid debugging output (except for '{ }') echo \grep MEK ${scrdir}/logs_*/* | sed "s|${scrdir}/logs_||" | grep -v '{ }' | sed 's|_mad.*DEBUG:||' | sort -u @@ -123,11 +167,11 @@ fi cd $scrdir/.. started="STARTED AT $(date)" -# (36/102) Six logs (double/mixed/float x hrd0/hrd1 x inl0) in each of the six SM processes +# (+36: 36/144) Six logs (double/mixed/float x hrd0/hrd1 x inl0) in each of the six SM processes [sm==1] \rm -rf gg_ttggg${suff}/lib/build.none_* cmd="./tput/teeThroughputX.sh -dmf -hrd -makej -eemumu -ggtt -ggttg -ggttgg -gqttq $ggttggg ${makeclean} ${opts}" tmp1=$(mktemp) -if [ "${bsm}" != "-bsmonly" ]; then +if [ "${sm}" == "1" ]; then $cmd; status=$? 
ls -ltr ee_mumu${suff}/lib/build.none_*_inl0_hrd* gg_tt${suff}/lib/build.none_*_inl0_hrd* gg_tt*g${suff}/lib/build.none_*_inl0_hrd* | egrep -v '(total|\./|\.build|_common|^$)' > $tmp1 else @@ -135,86 +179,140 @@ else fi ended1="$cmd\nENDED(1) AT $(date) [Status=$status]" -# (48/102) Four extra logs (double/float x hrd0/hrd1 x inl1) only in three of the six SM processes +# (+18: 54/144) Three scaling logs (double/mixed/float x hrd0 x inl0) in each of the six SM processes [scaling==1] +if [ "${scaling}" == "1" ]; then + if [ "${sm}" == "1" ]; then + cmd="./tput/teeThroughputX.sh -dmf -makej -eemumu -ggtt -ggttg -ggttgg -gqttq $ggttggg -scaling ${opts}" # no rebuild needed + $cmd; status=$? + else + cmd="./tput/teeThroughputX.sh -dmf -makej -eemumu -ggtt -ggttg -ggttgg -gqttq $ggttggg -scaling ${makeclean} ${opts}" # this is the first build + $cmd; status=$? + fi +else + cmd="SKIP '$cmd'"; echo $cmd; status=$? +fi +ended1sc="$cmd\nENDED(1-scaling) AT $(date) [Status=$status]" + +# (+6: 60/144) Three extra logs (double/mixed/float x hrd0 x inl0 + blasOn) only in two of the six SM processes (rebuild may be needed) [blas==1] +if [ "${blas}" == "1" ]; then + if [ "${sm}" == "1" ] || [ "${scaling}" == "1" ]; then + cmd="./tput/teeThroughputX.sh -ggtt -ggttgg -dmf -blasOn ${opts}" # no rebuild needed + $cmd; status=$? + else + cmd="./tput/teeThroughputX.sh -ggtt -ggttgg -dmf -blasOn ${makeclean} ${opts}" # this is the first build + $cmd; status=$? + fi +else + cmd="SKIP '$cmd'"; echo $cmd; status=$? +fi +ended2="$cmd\nENDED(2) AT $(date) [Status=$status]" + +# (+12: 72/144) Three scaling logs (double/mixed/float x hrd0 x inl0 + blasOn) only in four of the six SM processes [blas==1 || scaling==1] +if [ "${blas}" == "1" ] || [ "${scaling}" == "1" ]; then + cmd="./tput/teeThroughputX.sh -ggtt -ggttg -ggttgg -ggttggg -dmf -blasOn -scaling ${opts}" # no rebuild needed + $cmd; status=$? +else + cmd="SKIP '$cmd'"; echo $cmd; status=$? +fi +ended2sc="$cmd\nENDED(2-scaling) AT $(date) [Status=$status]" + +# (+12: 84/144) Four extra logs (double/float x hrd0/hrd1 x inl1) only in three of the six SM processes [sm==1] \rm -rf gg_ttg${suff}/lib/build.none_* \rm -rf gg_ttggg${suff}/lib/build.none_* cmd="./tput/teeThroughputX.sh -d_f -hrd -makej -eemumu -ggtt -ggttgg -inlonly ${makeclean} ${opts}" -tmp2=$(mktemp) -if [ "${bsm}" != "-bsmonly" ]; then +tmp3=$(mktemp) +if [ "${sm}" == "1" ]; then $cmd; status=$? - ls -ltr ee_mumu${suff}/lib/build.none_*_inl1_hrd* gg_tt${suff}/lib/build.none_*_inl1_hrd* gg_tt*g${suff}/lib/build.none_*_inl1_hrd* | egrep -v '(total|\./|\.build|_common|^$)' > $tmp2 + ls -ltr ee_mumu${suff}/lib/build.none_*_inl1_hrd* gg_tt${suff}/lib/build.none_*_inl1_hrd* gg_tt*g${suff}/lib/build.none_*_inl1_hrd* | egrep -v '(total|\./|\.build|_common|^$)' > $tmp3 else cmd="SKIP '$cmd'"; echo $cmd; status=$? fi -ended2="$cmd\nENDED(2) AT $(date) [Status=$status]" +ended3="$cmd\nENDED(3) AT $(date) [Status=$status]" -# (60/102) Two extra logs (double/float x hrd0 x inl0 + bridge) in all six SM processes (rebuild from cache) +# (+12: 96/144) Two extra logs (double/float x hrd0 x inl0 + bridge) in all six SM processes (rebuild from cache) [sm==1] cmd="./tput/teeThroughputX.sh -makej -eemumu -ggtt -ggttg -gqttq -ggttgg $ggttggg -d_f -bridge ${makeclean} ${opts}" -if [ "${bsm}" != "-bsmonly" ]; then +if [ "${sm}" == "1" ]; then $cmd; status=$? else cmd="SKIP '$cmd'"; echo $cmd; status=$? 
fi -ended3="$cmd\nENDED(3) AT $(date) [Status=$status]" +ended4="$cmd\nENDED(4) AT $(date) [Status=$status]" -# (66/102) Two extra logs (double/float x hrd0 x inl0 + rmbhst) only in three of the six SM processes (no rebuild needed) +# (+6: 102/144) Two extra logs (double/float x hrd0 x inl0 + rmbhst) only in three of the six SM processes (no rebuild needed) [sm==1] cmd="./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -d_f -rmbhst ${opts}" -if [ "${bsm}" != "-bsmonly" ]; then +if [ "${sm}" == "1" ]; then $cmd; status=$? else cmd="SKIP '$cmd'"; echo $cmd; status=$? fi -ended4="$cmd\nENDED(4) AT $(date) [Status=$status]" +ended5="$cmd\nENDED(5) AT $(date) [Status=$status]" -# (72/102) Two extra logs (double/float x hrd0 x inl0 + rndhst) only in three of the six SM processes (no rebuild needed) +# (+6: 108/144) Two extra logs (double/float x hrd0 x inl0 + rndhst) only in three of the six SM processes (no rebuild needed) [sm==1] cmd="./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -d_f ${rndhst} ${opts}" -if [ "${bsm}" != "-bsmonly" ] && [ "${rndhst}" != "-common" ]; then +if [ "${sm}" == "1" ] && [ "${rndhst}" != "-common" ]; then $cmd; status=$? else cmd="SKIP '$cmd'"; echo $cmd; status=$? fi -ended5="$cmd\nENDED(5) AT $(date) [Status=$status]" +ended6="$cmd\nENDED(6) AT $(date) [Status=$status]" -# (78/102) Two extra logs (double/float x hrd0 x inl0 + common) only in three of the six SM processes (no rebuild needed) +# (+6: 114/144) Two extra logs (double/float x hrd0 x inl0 + common) only in three of the six SM processes (no rebuild needed) [sm==1] cmd="./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -d_f -common ${opts}" -if [ "${bsm}" != "-bsmonly" ]; then +if [ "${sm}" == "1" ]; then $cmd; status=$? else cmd="SKIP '$cmd'"; echo $cmd; status=$? fi -ended6="$cmd\nENDED(6) AT $(date) [Status=$status]" +ended7="$cmd\nENDED(7) AT $(date) [Status=$status]" -# (102/102) Six extra logs (double/mixed/float x hrd0/hrd1 x inl0) only in the four BSM processes +# (+6: 120/144) Three extra logs (double/float x hrd0 x inl0 + noBlas) only in two of the six SM processes (rebuild is needed) [blas==1] +cmd="./tput/teeThroughputX.sh -ggtt -ggttgg -dmf -noBlas ${makeclean} ${opts}" +if [ "${blas}" == "1" ]; then + $cmd; status=$? +else + cmd="SKIP '$cmd'"; echo $cmd; status=$? +fi +ended8="$cmd\nENDED(8) AT $(date) [Status=$status]" + +# (+24: 144/144) Six extra logs (double/mixed/float x hrd0/hrd1 x inl0) only in the four BSM processes [bsm==1] cmd="./tput/teeThroughputX.sh -dmf -hrd -makej -susyggtt -susyggt1t1 -smeftggtttt -heftggbb ${makeclean} ${opts}" -tmp3=$(mktemp) -if [ "${bsm}" != "-nobsm" ]; then +tmp9=$(mktemp) +if [ "${bsm}" == "1" ]; then $cmd; status=$? - ls -ltr susy_gg_tt${suff}/lib/build.none_*_inl0_hrd* susy_gg_t1t1${suff}/lib/build.none_*_inl0_hrd* smeft_gg_tttt${suff}/lib/build.none_*_inl0_hrd* heft_gg_bb${suff}/lib/build.none_*_inl0_hrd* | egrep -v '(total|\./|\.build|_common|^$)' > $tmp2 + ls -ltr susy_gg_tt${suff}/lib/build.none_*_inl0_hrd* susy_gg_t1t1${suff}/lib/build.none_*_inl0_hrd* smeft_gg_tttt${suff}/lib/build.none_*_inl0_hrd* heft_gg_bb${suff}/lib/build.none_*_inl0_hrd* | egrep -v '(total|\./|\.build|_common|^$)' > $tmp9 else cmd="SKIP '$cmd'"; echo $cmd; status=$? 
fi -ended7="$cmd\nENDED(7) AT $(date) [Status=$status]" +ended9="$cmd\nENDED(9) AT $(date) [Status=$status]" echo echo "Build(1):" cat $tmp1 echo -echo "Build(2):" -cat $tmp2 +echo "Build(3):" +cat $tmp3 +echo +echo "Build(9):" +cat $tmp9 echo echo -e "$started" echo -e "$ended1" +echo -e "$ended1sc" echo -e "$ended2" +echo -e "$ended2sc" echo -e "$ended3" echo -e "$ended4" echo -e "$ended5" echo -e "$ended6" echo -e "$ended7" +echo -e "$ended8" +echo -e "$ended9" if [ "$ggttggg" == "" ]; then echo echo "To complete the test for ggttggg type:" echo " ./tput/teeThroughputX.sh -dmf -hrd -makej -ggttggg ${makeclean} ${opts}" + echo " ./tput/teeThroughputX.sh -dmf -makej -ggttggg -scaling ${makeclean} ${opts}" echo " ./tput/teeThroughputX.sh -makej -ggttggg -d_f -bridge ${makeclean} ${opts}" fi diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.scaling new file mode 100644 index 0000000000..1608b91cb1 --- /dev/null +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +DATE: 2025-10-11_15:39:36 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +2.365880e+06 1 256 +4.932658e+06 2 256 +1.130330e+07 4 256 +2.221065e+07 8 256 +3.796917e+07 16 256 +8.093742e+07 32 256 +1.438543e+08 64 256 +2.092652e+08 128 256 +2.586706e+08 256 256 +3.166572e+08 512 256 +3.450925e+08 1024 256 +### GPU: scaling test 32 +3.615411e+05 1 32 +7.956340e+05 2 32 +1.534533e+06 4 32 +2.896550e+06 8 32 +5.416499e+06 16 32 +1.086184e+07 32 32 +2.239377e+07 64 32 +4.040723e+07 128 32 +8.109125e+07 256 32 +1.501315e+08 512 32 +2.161406e+08 1024 32 +2.736516e+08 2048 32 +3.294400e+08 4096 32 +3.666924e+08 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.112163e+06 1 256 +1.095778e+06 2 256 +1.085622e+06 4 256 +### CPU: scaling test 32 +9.838283e+05 1 32 +1.009336e+06 2 32 +1.104848e+06 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.791676e+06 1 256 +1.843126e+06 2 256 +1.850216e+06 4 256 +### CPU: scaling test 32 +1.835283e+06 1 32 +1.487162e+06 2 32 +1.478777e+06 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.691677e+06 1 256 +2.725347e+06 2 256 +2.679688e+06 4 256 +### CPU: scaling test 32 +2.224230e+06 1 32 +2.558465e+06 2 32 +2.649774e+06 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.781551e+06 1 256 +2.448941e+06 2 256 +2.756282e+06 4 256 +### CPU: scaling test 32 +2.377238e+06 1 32 +2.626719e+06 2 32 +2.722014e+06 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.040101e+06 1 256 +2.059277e+06 2 256 +2.194331e+06 4 256 +### CPU: scaling test 32 +1.410251e+06 1 32 +1.626347e+06 2 32 +1.877466e+06 4 32 +========================================================================= + +TEST COMPLETED diff --git 
a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 2396150f34..6b63860e97 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_08:54:52 +DATE: 2025-10-11_15:13:43 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.715157e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.495446e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.756115e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.456825e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.020579e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.872827e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.722104 sec -INFO: No Floating Point Exceptions have been reported - 2,722,047,064 cycles # 2.855 GHz - 4,240,638,296 instructions # 1.56 insn per cycle - 1.034081868 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.693291 sec + 2,729,119,040 cycles # 2.827 GHz + 4,039,185,150 instructions # 1.48 insn per cycle + 1.043410313 seconds time elapsed +......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 Avg ME (F77/GPU) = 1.2828039868165201E-002 Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.013288e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.182482e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.182482e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.019940e+06 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 1.187870e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.187870e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.662912 sec -INFO: No Floating Point Exceptions have been reported - 19,208,633,801 cycles # 2.880 GHz - 46,193,026,925 instructions # 2.40 insn per cycle - 6.677929994 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 463) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.588033 sec + 19,038,044,386 cycles # 2.888 GHz + 46,485,585,356 instructions # 2.44 insn per cycle + 6.596061286 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 482) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.534189e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.004053e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.004053e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.557129e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.030035e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.030035e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.563122 
sec -INFO: No Floating Point Exceptions have been reported - 13,135,626,695 cycles # 2.874 GHz - 31,728,680,952 instructions # 2.42 insn per cycle - 4.573724377 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1662) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.460811 sec + 12,939,620,485 cycles # 2.898 GHz + 31,810,901,247 instructions # 2.46 insn per cycle + 4.469139042 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.938790e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.711147e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.711147e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.933537e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.681631e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.681631e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.694529 sec -INFO: No Floating Point Exceptions have been reported - 10,256,024,954 cycles # 2.769 GHz - 19,694,743,800 instructions # 1.92 insn per cycle - 3.707450749 seconds time elapsed 
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1909) (512y: 0) (512z: 0) +TOTAL : 3.671840 sec + 10,104,892,452 cycles # 2.749 GHz + 19,727,697,375 instructions # 1.95 insn per cycle + 3.679095535 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.944800e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.743029e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.743029e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.989488e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.781185e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.781185e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.690433 sec -INFO: No Floating Point Exceptions have been reported - 10,133,821,420 cycles # 2.743 GHz - 19,357,887,145 instructions # 1.91 insn per cycle - 3.703105135 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1647) (512y: 180) (512z: 0) +TOTAL : 3.576826 sec + 9,900,381,139 cycles # 2.765 GHz + 19,380,047,753 instructions # 1.96 insn per cycle 
+ 3.585735108 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 180) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.663763e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.201339e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.201339e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.671348e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.193135e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.193135e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.240790 sec -INFO: No Floating Point Exceptions have been reported - 8,791,817,571 cycles # 2.072 GHz - 15,864,118,825 instructions # 1.80 insn per cycle - 4.252718180 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 871) (512y: 156) (512z: 1258) +TOTAL : 4.184170 sec + 8,626,596,296 cycles # 2.060 GHz + 15,802,085,882 instructions # 1.83 insn per cycle + 4.189889070 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 873) (512y: 156) (512z: 1263) 
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index 97960252e7..7af659d91e 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,252 +10,216 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:36:32 +DATE: 2025-10-11_16:27:21 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.729675e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.983590e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.983590e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.684743e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.912007e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.912007e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.228883 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 7,241,513,211 cycles # 2.923 GHz - 12,978,693,777 instructions # 1.79 insn per cycle - 2.533005072 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge +TOTAL : 2.246839 sec + 7,225,562,469 cycles # 2.863 GHz + 12,863,341,750 instructions # 1.78 insn per cycle + 2.580507454 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 Avg ME (F77/GPU) = 1.2828039868165201E-002 Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 9.954014e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.154803e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.154803e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.838576e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.140129e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.140129e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.972350 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 20,384,148,235 cycles # 2.919 GHz - 46,410,615,309 instructions # 2.28 insn per cycle - 6.984536194 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 463) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.023062 sec + 20,241,810,963 cycles # 2.880 GHz + 46,692,050,581 instructions # 2.31 insn per cycle + 7.030271965 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 482) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.493408e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.921090e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.921090e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.470152e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.890657e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.890657e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.877492 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 14,402,886,877 cycles # 2.946 GHz - 32,567,021,239 instructions # 2.26 insn per cycle - 4.890045852 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1662) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.909808 sec + 14,179,876,666 cycles # 2.885 GHz + 32,595,242,292 instructions # 2.30 insn per cycle + 4.916954834 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.864025e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.539449e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.539449e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.819567e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.481129e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.481129e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.048395 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 11,503,225,226 cycles # 2.834 GHz - 21,048,377,803 instructions # 1.83 insn per cycle - 4.060868426 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1909) (512y: 0) (512z: 0) +TOTAL : 4.095092 sec + 11,322,720,907 cycles # 2.761 GHz + 21,029,920,385 instructions # 1.86 insn per cycle + 4.102381100 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.889652e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.596697e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.596697e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.870930e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.557290e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.557290e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.001389 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 11,334,121,636 cycles # 2.824 GHz - 20,717,870,984 instructions # 1.83 insn per cycle - 4.014529771 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1647) (512y: 180) (512z: 0) +TOTAL : 3.995093 sec + 11,100,469,150 cycles # 2.774 GHz + 20,681,913,151 instructions # 1.86 insn per cycle + 4.002396442 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 180) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.585647e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.044820e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.044820e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.582678e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.044225e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.044225e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.655129 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 10,329,600,614 cycles # 2.214 GHz - 17,028,538,054 instructions # 1.65 insn per cycle - 4.667149794 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 871) (512y: 156) (512z: 1258) +TOTAL : 4.613845 sec + 9,931,301,323 cycles # 2.150 GHz + 16,893,944,858 instructions # 1.70 insn per cycle + 4.620613606 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 873) (512y: 156) (512z: 1263) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt index a07615eec8..26a3ddb0c7 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:48:44 +DATE: 2025-10-11_16:42:49 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.479194e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.613891e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.774308e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.197440e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.038954e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.882278e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 1.350685 sec -INFO: No Floating Point Exceptions have been reported - 4,619,154,070 cycles # 2.910 GHz - 7,244,933,472 
instructions # 1.57 insn per cycle - 1.645096659 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 1.377431 sec + 4,700,779,648 cycles # 2.862 GHz + 7,103,932,908 instructions # 1.51 insn per cycle + 1.699431401 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 Avg ME (F77/GPU) = 1.2828039868165201E-002 Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.031231e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.202853e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.202853e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.015955e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.183181e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.183181e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.892564 sec -INFO: No Floating Point Exceptions have been reported - 20,216,212,113 cycles # 2.933 GHz - 46,211,289,901 instructions # 2.29 insn per cycle - 6.898049528 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 463) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.982657 sec + 20,123,225,872 cycles # 2.880 GHz + 46,589,016,073 instructions # 2.32 insn per cycle + 6.988225439 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 482) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.575355e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.054940e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.054940e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.538846e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.003610e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.003610e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.814476 sec -INFO: No Floating Point Exceptions have been reported - 14,161,512,947 cycles # 2.938 GHz - 31,718,115,030 instructions # 2.24 insn per cycle - 4.820285845 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1662) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.882603 sec + 14,026,556,551 cycles # 2.870 GHz + 31,813,873,682 instructions # 2.27 insn per cycle + 4.888198902 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.990481e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.780031e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.780031e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.898151e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.633048e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.633048e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.993697 sec -INFO: No Floating Point Exceptions have been reported - 11,344,220,574 cycles # 2.837 GHz - 19,628,934,109 instructions # 1.73 insn per cycle - 3.999571252 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1909) (512y: 0) (512z: 0) +TOTAL : 4.110798 sec + 11,260,535,150 cycles # 2.739 GHz + 19,633,224,823 instructions # 1.74 insn per cycle + 4.116583823 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.024448e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.841239e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.841239e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.970956e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.746513e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.746513e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.944999 sec -INFO: No Floating Point Exceptions have been reported - 11,153,243,188 cycles # 2.824 GHz - 19,098,861,484 instructions # 1.71 insn per cycle - 3.950731996 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1647) (512y: 180) (512z: 0) +TOTAL : 3.988212 sec + 10,998,193,863 cycles # 2.755 GHz + 19,082,144,667 instructions # 1.74 insn per cycle + 3.993745104 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 180) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.731970e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.289397e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.289397e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.672146e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.193639e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.193639e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.507668 sec -INFO: No Floating Point Exceptions have been reported - 9,996,448,485 cycles # 2.215 GHz - 15,693,646,767 instructions # 1.57 insn per cycle - 4.513790217 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 871) (512y: 156) (512z: 1258) +TOTAL : 4.562173 sec + 9,723,899,863 cycles # 2.130 GHz + 15,503,539,741 instructions # 1.59 insn per cycle + 4.567607097 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 873) (512y: 156) (512z: 1263) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt index cf4e1a1e41..6fb7bec229 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:45:58 +DATE: 2025-10-11_16:39:22 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.516686e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.553796e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.802555e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.211048e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.057687e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.886821e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.994565 sec -INFO: No Floating Point Exceptions have been reported - 3,557,200,491 cycles # 2.898 GHz - 7,056,373,361 
instructions # 1.98 insn per cycle - 1.285636058 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 1.007194 sec + 3,630,386,848 cycles # 2.852 GHz + 7,085,182,200 instructions # 1.95 insn per cycle + 1.329367848 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 Avg ME (F77/GPU) = 1.2828039868165201E-002 Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.036397e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.208868e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.208868e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.609025e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.108811e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.108811e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.480716 sec -INFO: No Floating Point Exceptions have been reported - 19,050,518,676 cycles # 2.938 GHz - 46,087,808,907 instructions # 2.42 insn per cycle - 6.486425223 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 463) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.966326 sec + 20,072,455,939 cycles # 2.880 GHz + 46,487,974,788 instructions # 2.32 insn per cycle + 6.971901471 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 482) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.562645e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.044042e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.044042e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.534636e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.011512e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.011512e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.441615 sec -INFO: No Floating Point Exceptions have been reported - 13,100,732,544 cycles # 2.946 GHz - 31,624,731,275 instructions # 2.41 insn per cycle - 4.447190414 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1662) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.522016 sec + 13,022,549,779 cycles # 2.877 GHz + 31,812,825,471 instructions # 2.44 insn per cycle + 4.527552219 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.962342e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.741135e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.741135e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.935285e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.687999e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.687999e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.617520 sec -INFO: No Floating Point Exceptions have been reported - 10,105,971,200 cycles # 2.790 GHz - 19,587,417,861 instructions # 1.94 insn per cycle - 3.623303854 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1909) (512y: 0) (512z: 0) +TOTAL : 3.667443 sec + 10,100,998,652 cycles # 2.751 GHz + 19,728,236,183 instructions # 1.95 insn per cycle + 3.673057057 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.035108e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.854302e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.854302e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.992051e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.787343e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.787343e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.498153 sec -INFO: No Floating Point Exceptions have been reported - 9,879,352,969 cycles # 2.820 GHz - 19,249,039,766 instructions # 1.95 insn per cycle - 3.504047287 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1647) (512y: 180) (512z: 0) +TOTAL : 3.571290 sec + 9,885,962,165 cycles # 2.765 GHz + 19,369,829,317 instructions # 1.96 insn per cycle + 3.576876880 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 180) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.738426e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.300548e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.300548e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.693244e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.231997e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.231997e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.029997 sec -INFO: No Floating Point Exceptions have been reported - 8,617,786,478 cycles # 2.136 GHz - 15,755,373,979 instructions # 1.83 insn per cycle - 4.035885525 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 871) (512y: 156) (512z: 1258) +TOTAL : 4.132357 sec + 8,622,523,625 cycles # 2.084 GHz + 15,800,710,236 instructions # 1.83 insn per cycle + 4.137999929 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 873) (512y: 156) (512z: 1263) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt index 23a95e9b43..93b11c3b79 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,235 +10,213 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:43:10 +DATE: 2025-10-11_16:35:54 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.035607e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.566958e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.715605e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.941086e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.084749e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.895980e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.900996 sec -INFO: No Floating Point Exceptions have been reported - 6,141,367,935 cycles # 2.877 GHz - 11,470,611,621 instructions # 1.87 insn per cycle - 2.190401749 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst +TOTAL : 1.918291 sec + 6,252,733,621 cycles # 2.863 GHz + 11,379,391,021 instructions # 1.82 insn per cycle + 2.240220236 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 Avg ME (F77/GPU) = 1.2828039868165201E-002 Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.040250e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.212161e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.212161e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.013186e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.180354e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.180354e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.458941 sec -INFO: No Floating Point Exceptions have been reported - 19,062,791,283 cycles # 2.949 GHz - 46,091,693,422 instructions # 2.42 insn per cycle - 6.464859061 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 463) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.629592 sec + 19,062,117,259 cycles # 2.874 GHz + 46,484,682,805 instructions # 2.44 insn per cycle + 6.635147352 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 482) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, 
FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.576646e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.057103e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.057103e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.545386e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.014583e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.014583e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.406292 sec -INFO: No Floating Point Exceptions have been reported - 12,965,800,121 cycles # 2.939 GHz - 31,623,980,844 instructions # 2.44 insn per cycle - 4.412202935 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1662) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.493129 sec + 12,958,309,518 cycles # 2.881 GHz + 31,813,104,162 instructions # 2.46 insn per cycle + 4.498775995 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 
tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.982815e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.782156e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.782156e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.912965e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.656557e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.656557e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.583602 sec -INFO: No Floating Point Exceptions have been reported - 10,107,254,042 cycles # 2.816 GHz - 19,587,412,579 instructions # 1.94 insn per cycle - 3.589639966 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1909) (512y: 0) (512z: 0) +TOTAL : 3.707178 sec + 10,138,189,210 cycles # 2.732 GHz + 19,728,296,128 instructions # 1.95 insn per cycle + 3.712878607 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.036151e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.856576e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.856576e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.985253e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.770354e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.770354e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.497964 sec -INFO: No Floating Point Exceptions have been reported - 9,879,922,849 cycles # 2.820 GHz - 19,260,007,955 instructions # 1.95 insn per cycle - 3.503929332 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1647) (512y: 180) (512z: 0) +TOTAL : 3.582064 sec + 9,886,774,092 cycles # 2.757 GHz + 19,370,169,431 instructions # 1.96 insn per cycle + 3.587619730 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 180) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.741980e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.303561e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.303561e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.686193e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.230105e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.230105e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.020543 sec -INFO: No Floating Point Exceptions have been reported - 8,613,807,526 cycles # 2.140 GHz - 15,755,294,312 instructions # 1.83 insn per cycle - 4.026429840 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 871) (512y: 156) (512z: 1258) +TOTAL : 4.149789 sec + 8,677,655,368 cycles # 2.089 GHz + 15,800,773,198 instructions # 1.82 insn per cycle + 4.155474285 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 873) (512y: 156) (512z: 1263) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt index 25ac5b33ed..0a4631bfc6 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_08:55:23 +DATE: 2025-10-11_15:14:20 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.275982e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.504846e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.746692e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.305792e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.022345e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.904091e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.698739 sec -INFO: No Floating Point Exceptions have been reported - 2,671,543,996 cycles # 2.868 GHz - 4,201,680,962 instructions # 1.57 insn per 
cycle - 1.042000131 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.693566 sec + 2,710,557,615 cycles # 2.827 GHz + 4,083,363,883 instructions # 1.51 insn per cycle + 1.021549892 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 130 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 Avg ME (F77/GPU) = 1.2828039868165201E-002 Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.030289e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.210430e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.210430e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.017450e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.184170e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.184170e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.580678 sec -INFO: No Floating Point Exceptions have been reported - 19,388,414,039 cycles # 2.942 GHz - 46,168,116,276 instructions # 2.38 insn per cycle - 6.592554583 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 452) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.603628 sec + 19,045,137,786 cycles # 2.882 GHz + 46,458,572,507 instructions # 2.44 insn per cycle + 6.609045751 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 474) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.571872e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.069657e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.069657e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.561588e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.042161e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.042161e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.454497 sec -INFO: No Floating Point Exceptions have been reported - 13,123,917,893 cycles # 2.941 GHz - 31,665,954,915 instructions # 2.41 insn per cycle - 4.468095413 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1648) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.447754 sec + 12,946,444,589 cycles # 2.908 GHz + 31,786,052,376 instructions # 2.46 insn per cycle + 4.453579330 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1659) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.982748e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.777393e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.777393e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.943406e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.706594e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.706594e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.618164 sec -INFO: No Floating Point Exceptions have been reported - 10,210,665,805 cycles # 2.814 GHz - 19,682,748,403 instructions # 1.93 insn per cycle - 3.629801888 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1894) (512y: 0) (512z: 0) +TOTAL : 3.652290 sec + 10,144,241,352 cycles # 2.774 GHz + 19,717,545,087 instructions # 1.94 insn per cycle + 3.657857806 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1902) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.010638e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.831487e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.831487e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.997101e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.794298e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.794298e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.575425 sec -INFO: No Floating Point Exceptions have been reported - 10,055,677,244 cycles # 2.805 GHz - 19,379,411,405 instructions # 1.93 insn per cycle - 3.588891240 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1636) (512y: 178) (512z: 0) +TOTAL : 3.563735 sec + 9,854,038,944 cycles # 2.762 GHz + 19,385,201,008 instructions # 1.97 insn per cycle + 3.569441170 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1640) (512y: 180) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.768631e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.372427e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.372427e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.736214e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.301251e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.301251e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.003265 sec -INFO: No Floating Point Exceptions have been reported - 8,643,505,927 cycles # 2.154 GHz - 15,697,303,734 instructions # 1.82 insn per cycle - 4.017112338 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 833) (512y: 153) (512z: 1240) +TOTAL : 4.039858 sec + 8,445,670,568 cycles # 2.088 GHz + 15,663,059,460 instructions # 1.85 insn per cycle + 4.045505615 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 845) (512y: 154) (512z: 1244) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt index 9d9181639f..9b568d27dc 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,236 +10,213 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:26:55 +DATE: 2025-10-11_16:16:29 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.029061e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.569612e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.860356e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.176996e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.012495e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.891048e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.695056 sec -INFO: No Floating Point Exceptions have been reported - 2,704,879,803 cycles # 2.897 GHz - 4,231,460,596 instructions # 1.56 insn per 
cycle - 0.994220648 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.694489 sec + 2,721,882,133 cycles # 2.827 GHz + 4,075,193,578 instructions # 1.50 insn per cycle + 1.025946647 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 Avg ME (F77/GPU) = 1.2828039868165201E-002 Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.606609e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.069672e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.069672e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.542747e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.967302e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.967302e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.367451 sec -INFO: No Floating Point Exceptions have been reported - 12,912,062,009 cycles # 2.950 GHz - 32,678,927,799 instructions # 2.53 insn per cycle - 4.379017229 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 281) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.494551 sec + 12,989,678,815 cycles # 2.889 GHz + 32,646,175,174 instructions # 2.51 insn per cycle + 4.499744847 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 274) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.977635e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.819919e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.819919e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.896999e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.655930e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.655930e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.639596 sec -INFO: No Floating Point Exceptions have been reported - 10,716,876,159 cycles # 2.936 GHz - 25,005,426,831 instructions # 2.33 insn per cycle - 3.651343591 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1246) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.740364 sec + 10,735,813,544 cycles # 2.867 GHz + 24,899,817,001 instructions # 2.32 insn per cycle + 3.745821170 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1252) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.209379e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.259757e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.259757e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.183902e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.196051e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.196051e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.300079 sec -INFO: No Floating Point Exceptions have been reported - 9,398,178,742 cycles # 2.838 GHz - 16,938,114,674 instructions # 1.80 insn per cycle - 3.311853262 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1599) (512y: 0) (512z: 0) +TOTAL : 3.294762 sec + 9,147,621,247 cycles # 2.773 GHz + 16,945,065,636 instructions # 1.85 insn per cycle + 3.300349072 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1609) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.277311e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.397001e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.397001e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.267329e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.347814e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.347814e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.212263 sec -INFO: No Floating Point Exceptions have been reported - 9,139,009,296 cycles # 2.835 GHz - 16,502,297,129 instructions # 1.81 insn per cycle - 3.223908096 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1355) (512y: 139) (512z: 0) +TOTAL : 3.186397 sec + 8,854,475,202 cycles # 2.775 GHz + 16,456,181,779 instructions # 1.86 insn per cycle + 3.191297678 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1359) (512y: 139) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.921368e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.661482e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.661482e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.906352e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.613901e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.613901e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.727052 sec -INFO: No Floating Point Exceptions have been reported - 8,146,634,535 cycles # 2.180 GHz - 14,661,732,896 instructions # 1.80 insn per cycle - 3.738643291 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1003) (512y: 158) (512z: 946) +TOTAL : 3.717092 sec + 7,920,630,909 cycles # 2.128 GHz + 14,619,990,772 instructions # 1.85 insn per cycle + 3.722531495 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1004) (512y: 158) (512z: 960) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt index abe54e8953..e2fad0413c 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:27:22 +DATE: 2025-10-11_16:16:58 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.921706e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.715910e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.877358e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.326337e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.070850e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.905795e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.678632 sec -INFO: No Floating Point Exceptions have been reported - 2,636,898,249 cycles # 2.884 GHz - 4,067,260,892 instructions # 1.54 insn per cycle - 0.973352356 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.687566 sec + 2,696,565,159 cycles # 2.829 GHz + 4,062,904,580 instructions # 1.51 insn per cycle + 1.010928380 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 130 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 Avg ME (F77/GPU) = 1.2828039868165201E-002 Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.084164e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.941928e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.941928e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.043775e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.849543e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.849543e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.469758 sec -INFO: No Floating Point Exceptions have been reported - 10,217,900,291 cycles # 2.936 GHz - 25,614,437,724 instructions # 2.51 insn per cycle - 3.480862891 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 236) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.494605 sec + 10,083,396,787 cycles # 2.882 GHz + 25,760,449,217 instructions # 2.55 insn per cycle + 3.499888853 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 246) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW 
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.313032e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.558172e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.558172e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.297652e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.517332e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.517332e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.173341 sec -INFO: No Floating Point Exceptions have been reported - 9,354,473,123 cycles # 2.939 GHz - 21,650,720,885 instructions # 2.31 insn per cycle - 3.184272296 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1112) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.161432 sec + 9,089,198,091 cycles # 2.871 GHz + 21,827,149,693 instructions # 2.40 insn per cycle + 3.166784889 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1116) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.358550e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.604458e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.604458e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.295786e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.454015e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.454015e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.115986 sec -INFO: No Floating Point Exceptions have been reported - 8,850,186,465 cycles # 2.831 GHz - 16,062,849,181 instructions # 1.81 insn per cycle - 3.126797345 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1497) (512y: 0) (512z: 0) +TOTAL : 3.158774 sec + 8,695,257,664 cycles # 2.749 GHz + 15,965,615,823 instructions # 1.84 insn per cycle + 3.164128836 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1484) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.422935e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.724037e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.724037e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.398085e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.643924e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.643924e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.043301 sec -INFO: No Floating Point Exceptions have been reported - 8,651,791,606 cycles # 2.834 GHz - 15,666,461,627 instructions # 1.81 insn per cycle - 3.054177777 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1264) (512y: 141) (512z: 0) +TOTAL : 3.034628 sec + 8,440,163,243 cycles # 2.777 GHz + 15,795,186,827 instructions # 1.87 insn per cycle + 3.039990401 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1288) (512y: 141) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.052275e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.908416e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.908416e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.002688e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.799181e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.799181e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.514529 sec -INFO: No Floating Point Exceptions have been reported - 7,791,531,975 cycles # 2.211 GHz - 14,393,714,103 instructions # 1.85 insn per cycle - 3.525649878 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1031) (512y: 164) (512z: 876) +TOTAL : 3.557099 sec + 7,607,771,698 cycles # 2.137 GHz + 14,233,174,966 instructions # 1.87 insn per cycle + 3.562310738 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 992) (512y: 158) (512z: 880) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.scaling new file mode 100644 index 0000000000..a78c1b2deb --- /dev/null +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +DATE: 2025-10-11_15:40:18 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +2.981251e+06 1 256 +6.047935e+06 2 256 +1.122832e+07 4 256 +2.252678e+07 8 256 +4.235605e+07 16 256 +8.416122e+07 32 256 +1.466169e+08 64 256 +3.049065e+08 128 256 +4.651176e+08 256 256 +6.085927e+08 512 256 +7.481343e+08 1024 256 +### GPU: scaling test 32 +4.108938e+05 1 32 +7.731896e+05 2 32 +1.472652e+06 4 32 +3.058688e+06 8 32 +4.923029e+06 16 32 +1.154805e+07 32 32 +2.237762e+07 64 32 +4.518229e+07 128 32 +7.698959e+07 256 32 +1.503754e+08 512 32 +2.942634e+08 1024 32 +4.027161e+08 2048 32 +5.199929e+08 4096 32 +5.853205e+08 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.083777e+06 1 256 +1.126195e+06 2 256 +1.126272e+06 4 256 +### CPU: scaling test 32 +1.086034e+06 1 32 +1.116071e+06 2 32 +1.128798e+06 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.853894e+06 1 256 +3.152865e+06 2 256 +3.025871e+06 4 256 +### CPU: scaling test 32 +2.851034e+06 1 32 +2.925313e+06 2 32 +2.581790e+06 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.276087e+06 1 256 +3.611916e+06 2 256 +3.183634e+06 4 256 +### CPU: scaling test 32 +3.073082e+06 1 32 +3.375349e+06 2 32 +2.927052e+06 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.662480e+06 1 256 +3.408266e+06 2 256 +3.661694e+06 4 256 +### CPU: scaling test 32 +1.789109e+06 1 32 +3.449949e+06 2 32 +3.560402e+06 4 32 +========================================================================= +scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.254224e+06 1 256 +3.401880e+06 2 256 +3.536803e+06 4 256 +### CPU: scaling test 32 +1.684033e+06 1 32 +2.687382e+06 2 32 +2.916448e+06 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index fa697401ba..9dacd0443a 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_08:56:56 +DATE: 2025-10-11_15:16:08 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.318402e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.547340e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.573294e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.223637e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.675161e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.645637e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.580539 sec -INFO: No Floating Point Exceptions have been reported - 2,318,735,379 cycles # 2.865 GHz - 3,612,120,055 instructions # 1.56 insn per cycle - 0.879357898 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 109 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.588199 sec + 2,408,587,167 cycles # 2.842 GHz + 3,683,823,828 instructions # 1.53 insn per cycle + 0.903961148 seconds time elapsed +......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 76 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/GPU) = 1.2828112132410752E-002 +Relative difference = 7.1821224749348815e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.072197e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.275533e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) 
= ( 1.275533e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.035251e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.217456e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.217456e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.274259 sec -INFO: No Floating Point Exceptions have been reported - 18,464,131,410 cycles # 2.940 GHz - 45,058,020,075 instructions # 2.44 insn per cycle - 6.281329583 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 411) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.454566 sec + 18,664,660,450 cycles # 2.890 GHz + 45,251,843,843 instructions # 2.42 insn per cycle + 6.459911913 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039854866802E-002 Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.257463e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.446957e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.446957e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.213678e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.366853e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.366853e+06 ) sec^-1 
MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.180369 sec -INFO: No Floating Point Exceptions have been reported - 9,372,467,471 cycles # 2.941 GHz - 22,319,965,268 instructions # 2.38 insn per cycle - 3.189536232 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1954) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.221547 sec + 9,347,928,391 cycles # 2.898 GHz + 22,375,063,737 instructions # 2.39 insn per cycle + 3.226933374 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1966) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.408379e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.710073e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.710073e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.361341e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.581474e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.581474e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.001033 sec -INFO: No Floating Point Exceptions have been reported - 8,493,792,111 cycles # 2.825 GHz - 15,797,222,111 
instructions # 1.86 insn per cycle - 3.010052254 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2565) (512y: 0) (512z: 0) +TOTAL : 3.041655 sec + 8,385,705,935 cycles # 2.753 GHz + 15,815,253,481 instructions # 1.89 insn per cycle + 3.046966557 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2575) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.426130e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.768067e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.768067e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.426573e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.714317e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.714317e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.984448 sec -INFO: No Floating Point Exceptions have been reported - 8,427,466,763 cycles # 2.818 GHz - 15,640,000,146 instructions # 1.86 insn per cycle - 2.993491493 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2462) (512y: 12) (512z: 0) +TOTAL : 2.970277 sec + 8,276,306,484 
cycles # 2.782 GHz + 15,653,687,115 instructions # 1.89 insn per cycle + 2.975610452 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2472) (512y: 10) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.427110e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.709739e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.709739e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.392250e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.619370e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.619370e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.984139 sec -INFO: No Floating Point Exceptions have been reported - 6,725,622,216 cycles # 2.249 GHz - 12,910,486,373 instructions # 1.92 insn per cycle - 2.994013668 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1698) (512y: 16) (512z: 1440) +TOTAL : 3.010134 sec + 6,663,148,382 cycles # 2.210 GHz + 12,894,118,429 instructions # 1.94 insn per cycle + 3.015621591 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1701) (512y: 5) (512z: 1445) 
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052585973637E-002 Relative difference = 2.0158743040564767e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt index 9136826931..215370ad38 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,252 +10,216 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:37:07 +DATE: 2025-10-11_16:28:03 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.256593e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.121486e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.121486e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.220206e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.249013e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.249013e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.691319 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 5,610,006,933 cycles # 2.911 GHz - 10,218,919,767 instructions # 1.82 insn per cycle - 1.984436466 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge +TOTAL : 1.704287 sec + 5,590,644,626 cycles # 2.843 GHz + 10,005,372,723 instructions # 1.79 insn per cycle + 2.022727811 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 109 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 76 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/GPU) = 1.2828112132410752E-002 +Relative difference = 7.1821224749348815e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.060836e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.248384e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.248384e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.010617e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.186955e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.186955e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.418392 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 18,916,088,440 cycles # 2.945 GHz - 45,156,650,630 instructions # 2.39 insn per cycle - 6.425565221 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 411) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.713335 sec + 19,329,941,883 cycles # 2.877 GHz + 45,365,505,516 instructions # 2.35 insn per cycle + 6.720261817 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039854866802E-002 Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.163234e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.223206e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.223206e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.128665e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.170237e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.170237e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.414716 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 10,073,193,872 cycles # 2.945 GHz - 23,610,645,909 instructions # 2.34 insn per cycle - 3.421707000 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1954) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.459266 sec + 10,015,354,665 cycles # 2.890 GHz + 23,673,664,836 instructions # 2.36 insn per cycle + 3.466212345 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1966) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.302389e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.467769e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.467769e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.263697e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.371457e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.371457e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.241454 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 9,215,037,610 cycles # 2.837 GHz - 16,874,646,512 instructions # 1.83 insn per cycle - 3.248598680 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2565) (512y: 0) (512z: 0) +TOTAL : 3.286775 sec + 9,106,177,679 cycles # 2.766 GHz + 16,899,675,653 instructions # 1.86 insn per cycle + 3.293662887 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2575) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.316990e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.533576e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.533576e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.302738e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.462511e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.462511e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.224710 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 9,166,209,661 cycles # 2.837 GHz - 16,710,284,997 instructions # 1.82 insn per cycle - 3.231713030 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2462) (512y: 12) (512z: 0) +TOTAL : 3.240690 sec + 8,985,254,061 cycles # 2.768 GHz + 16,737,997,718 instructions # 1.86 insn per cycle + 3.247472027 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2472) (512y: 10) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.333210e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.469405e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.469405e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.254993e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.321155e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.321155e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.205451 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 7,432,634,328 cycles # 2.315 GHz - 14,074,642,515 instructions # 1.89 insn per cycle - 3.212353581 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1698) (512y: 16) (512z: 1440) +TOTAL : 3.302457 sec + 7,458,897,279 cycles # 2.255 GHz + 14,069,459,173 instructions # 1.89 insn per cycle + 3.309041869 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1701) (512y: 5) (512z: 1445) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052585973637E-002 Relative difference = 2.0158743040564767e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt index 3c8228d85b..c35f97f2b8 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:49:18 +DATE: 2025-10-11_16:43:25 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.233592e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.244967e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.184868e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.253381e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.370790e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.518342e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0 -TOTAL : 1.220966 sec -INFO: No Floating Point Exceptions have been reported - 4,183,681,416 cycles # 2.867 GHz - 6,662,508,205 instructions 
# 1.59 insn per cycle - 1.516447212 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common -==PROF== Profiling "sigmaKin": launch__registers_per_thread 109 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 1.218481 sec + 4,207,892,724 cycles # 2.859 GHz + 6,617,854,340 instructions # 1.57 insn per cycle + 1.530363886 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 76 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/GPU) = 1.2828112132410752E-002 +Relative difference = 7.1821224749348815e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = 
SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.080178e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.275874e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.275874e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.036512e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.218588e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.218588e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 6.537773 sec -INFO: No Floating Point Exceptions have been reported - 19,269,764,932 cycles # 2.946 GHz - 45,190,617,795 instructions # 2.35 insn per cycle - 6.543013626 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 411) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.791690 sec + 19,679,660,217 cycles # 2.896 GHz + 45,434,399,439 instructions # 2.31 insn per cycle + 6.797219573 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039854866802E-002 Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.263942e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.453881e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.453881e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.200562e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.338496e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.338496e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 3.487545 sec -INFO: No Floating Point Exceptions have been reported - 10,298,424,695 cycles # 2.949 GHz - 22,355,388,978 instructions # 2.17 insn per cycle - 3.493059791 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1954) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.583516 sec + 10,308,901,515 cycles # 2.874 GHz + 22,457,815,111 instructions # 2.18 insn per cycle + 3.588832664 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1966) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.406924e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.701531e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.701531e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.344557e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.579879e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.579879e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.322767 sec -INFO: No Floating Point Exceptions have been reported - 9,443,809,325 cycles # 2.838 GHz - 15,664,102,195 instructions # 1.66 insn per cycle - 3.328357008 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2565) (512y: 0) (512z: 0) +TOTAL : 3.404488 sec + 9,434,839,609 cycles # 2.768 GHz + 15,726,735,545 instructions # 1.67 insn per cycle + 3.409840593 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2575) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.446360e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.803645e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.803645e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.407789e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.709415e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.709415e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.287328 sec -INFO: No Floating Point Exceptions have been reported - 9,371,124,961 cycles # 2.847 GHz - 15,299,944,141 instructions # 1.63 insn per cycle - 3.292839828 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2462) (512y: 12) (512z: 0) +TOTAL : 3.341843 sec + 9,335,373,029 cycles # 2.790 GHz + 15,365,478,048 instructions # 1.65 insn per cycle + 3.347112669 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2472) (512y: 10) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.466708e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.777222e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.777222e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.374032e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.592267e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.592267e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.269312 sec -INFO: No Floating Point Exceptions have been reported - 7,659,274,117 cycles # 2.340 GHz - 12,573,895,764 instructions # 1.64 insn per cycle - 3.274843213 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1698) (512y: 16) (512z: 1440) +TOTAL : 3.383460 sec + 7,651,857,041 cycles # 2.259 GHz + 12,604,317,732 instructions # 1.65 insn per cycle + 3.388617759 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1701) (512y: 5) (512z: 1445) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052585973637E-002 Relative difference = 2.0158743040564767e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt index 7f30dafdfd..4fe47b6309 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:46:29 +DATE: 2025-10-11_16:39:57 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.282321e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.333955e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.369324e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.232997e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.388992e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.560013e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.867789 sec -INFO: No Floating Point Exceptions have been reported - 3,167,199,789 cycles # 2.899 GHz - 6,506,216,930 instructions 
# 2.05 insn per cycle - 1.149942283 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst -==PROF== Profiling "sigmaKin": launch__registers_per_thread 109 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.882532 sec + 3,214,322,203 cycles # 2.828 GHz + 6,452,752,496 instructions # 2.01 insn per cycle + 1.194579493 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 76 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/GPU) = 1.2828112132410752E-002 +Relative difference = 7.1821224749348815e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = 
SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.085219e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.281583e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.281583e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.031419e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.212428e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.212428e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.166056 sec -INFO: No Floating Point Exceptions have been reported - 18,234,644,828 cycles # 2.955 GHz - 45,008,398,832 instructions # 2.47 insn per cycle - 6.171760600 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 411) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.477368 sec + 18,661,812,568 cycles # 2.879 GHz + 45,252,341,321 instructions # 2.42 insn per cycle + 6.482693144 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039854866802E-002 Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.256894e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.462086e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.462086e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.196497e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.342466e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.342466e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.159870 sec -INFO: No Floating Point Exceptions have been reported - 9,347,982,513 cycles # 2.954 GHz - 22,275,896,372 instructions # 2.38 insn per cycle - 3.165402193 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1954) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.247962 sec + 9,353,957,329 cycles # 2.876 GHz + 22,375,680,082 instructions # 2.39 insn per cycle + 3.253308897 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1966) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.410366e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.712636e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.712636e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.352259e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.566980e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.566980e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.983322 sec -INFO: No Floating Point Exceptions have been reported - 8,463,194,185 cycles # 2.833 GHz - 15,755,395,679 instructions # 1.86 insn per cycle - 2.988746216 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2565) (512y: 0) (512z: 0) +TOTAL : 3.051523 sec + 8,419,136,103 cycles # 2.756 GHz + 15,815,678,204 instructions # 1.88 insn per cycle + 3.056921587 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2575) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.454105e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.801490e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.801490e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.409169e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.699321e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.699321e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.933599 sec -INFO: No Floating Point Exceptions have been reported - 8,319,397,972 cycles # 2.832 GHz - 15,593,973,322 instructions # 1.87 insn per cycle - 2.939101584 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2462) (512y: 12) (512z: 0) +TOTAL : 2.991131 sec + 8,296,340,422 cycles # 2.770 GHz + 15,649,217,834 instructions # 1.89 insn per cycle + 2.996375115 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2472) (512y: 10) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.469652e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.768397e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.768397e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.362594e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.567971e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.567971e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.922384 sec -INFO: No Floating Point Exceptions have been reported - 6,636,368,959 cycles # 2.267 GHz - 12,865,256,567 instructions # 1.94 insn per cycle - 2.927905791 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1698) (512y: 16) (512z: 1440) +TOTAL : 3.046737 sec + 6,657,108,236 cycles # 2.182 GHz + 12,894,608,228 instructions # 1.94 insn per cycle + 3.052164277 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1701) (512y: 5) (512z: 1445) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052585973637E-002 Relative difference = 2.0158743040564767e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt index e2ecb9b5fd..a89730724c 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,235 +10,213 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:43:42 +DATE: 2025-10-11_16:36:29 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.979354e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.311142e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.251832e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.680186e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.389167e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.490052e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.493081 sec -INFO: No Floating Point Exceptions have been reported - 5,009,051,141 cycles # 2.916 GHz - 9,204,393,500 instructions # 1.84 insn per cycle - 1.774548277 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst +TOTAL : 1.528523 sec + 5,119,450,809 cycles # 2.867 GHz + 9,180,981,618 instructions # 1.79 insn per cycle + 1.841912956 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 109 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 76 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/GPU) = 1.2828112132410752E-002 +Relative difference = 7.1821224749348815e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.077151e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.276926e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.276926e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.028340e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.213140e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.213140e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.211513 sec -INFO: No Floating Point Exceptions have been reported - 18,299,232,198 cycles # 2.944 GHz - 45,005,768,829 instructions # 2.46 insn per cycle - 6.217115880 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 411) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.495821 sec + 18,726,914,707 cycles # 2.881 GHz + 45,252,147,765 instructions # 2.42 insn per cycle + 6.501028276 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The 
following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039854866802E-002 Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.268380e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.460029e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.460029e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.215291e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.366977e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.366977e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.148224 sec -INFO: No Floating Point Exceptions have been reported - 9,293,240,022 cycles # 2.948 GHz - 22,275,553,802 instructions # 2.40 insn per cycle - 3.153857529 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1954) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.221927 sec + 9,338,555,823 cycles # 2.895 GHz + 22,375,290,209 instructions # 2.40 insn per cycle + 3.227594710 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1966) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.395770e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.675698e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.675698e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.376691e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.618820e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.618820e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.994942 sec -INFO: No Floating Point Exceptions have been reported - 8,447,981,393 cycles # 2.817 GHz - 15,754,576,494 instructions # 1.86 insn per cycle - 3.000419944 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2565) (512y: 0) (512z: 0) +TOTAL : 3.021316 sec + 8,423,872,827 cycles # 2.784 GHz + 15,815,022,260 instructions # 1.88 insn per cycle + 3.026847541 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2575) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.419912e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.751119e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.751119e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.398006e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.678623e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.678623e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.971435 sec -INFO: No Floating Point Exceptions have been reported - 8,357,800,499 cycles # 2.808 GHz - 15,594,139,449 instructions # 1.87 insn per cycle - 2.977163262 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2462) (512y: 12) (512z: 0) +TOTAL : 3.003583 sec + 8,296,430,270 cycles # 2.758 GHz + 15,653,949,933 instructions # 1.89 insn per cycle + 3.009064332 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2472) (512y: 10) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.455367e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.730952e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.730952e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.376583e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.598108e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.598108e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.933639 sec -INFO: No Floating Point Exceptions have been reported - 6,669,997,057 cycles # 2.271 GHz - 12,867,351,511 instructions # 1.93 insn per cycle - 2.938851588 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1698) (512y: 16) (512z: 1440) +TOTAL : 3.029921 sec + 6,657,348,870 cycles # 2.194 GHz + 12,894,427,961 instructions # 1.94 insn per cycle + 3.035366895 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1701) (512y: 5) (512z: 1445) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052585973637E-002 Relative difference = 2.0158743040564767e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt index 9e915de581..1a227eb682 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,236 +10,213 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_08:57:23 +DATE: 2025-10-11_15:16:39 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.310707e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.890276e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.030864e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.199628e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.780940e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.098104e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.577005 sec -INFO: No Floating Point Exceptions have been reported - 2,340,023,876 cycles # 2.880 GHz - 3,638,052,704 instructions # 1.55 insn per 
cycle - 0.886148283 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 79 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.592040 sec + 2,436,367,118 cycles # 2.822 GHz + 3,629,290,640 instructions # 1.49 insn per cycle + 0.920365880 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 72 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/GPU) = 1.2828112132410752E-002 +Relative difference = 7.1821224749348815e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] 
[hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.074456e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.269687e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.269687e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.039860e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.223391e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.223391e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.244443 sec -INFO: No Floating Point Exceptions have been reported - 18,377,232,357 cycles # 2.941 GHz - 45,025,324,964 instructions # 2.45 insn per cycle - 6.253002386 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 397) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.427980 sec + 18,659,345,357 cycles # 2.901 GHz + 45,239,622,020 instructions # 2.42 insn per cycle + 6.433370102 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 408) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039854866802E-002 Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.251309e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.439034e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.439034e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.201529e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.346468e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.346468e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.184453 sec -INFO: No Floating Point Exceptions have been reported - 9,383,250,913 cycles # 2.940 GHz - 22,280,358,761 instructions # 2.37 insn per cycle - 3.194375038 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1935) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.240561 sec + 9,296,413,050 cycles # 2.865 GHz + 22,342,996,788 instructions # 2.40 insn per cycle + 3.245872745 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1946) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.403334e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.700033e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.700033e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.385031e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.622316e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.622316e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.004384 sec -INFO: No Floating Point Exceptions have been reported - 8,513,730,278 cycles # 2.827 GHz - 15,791,909,505 instructions # 1.85 insn per cycle - 3.013283160 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2540) (512y: 0) (512z: 0) +TOTAL : 3.012220 sec + 8,383,528,688 cycles # 2.779 GHz + 15,803,482,216 instructions # 1.89 insn per cycle + 3.017661777 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2547) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.444935e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.799463e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.799463e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.412617e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.685973e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.685973e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.959849 sec -INFO: No Floating Point Exceptions have been reported - 8,395,161,248 cycles # 2.830 GHz - 15,634,676,534 instructions # 1.86 insn per cycle - 2.968734397 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2438) (512y: 10) (512z: 0) +TOTAL : 2.983146 sec + 8,252,716,563 cycles # 2.763 GHz + 15,642,709,201 instructions # 1.90 insn per cycle + 2.988589217 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2444) (512y: 10) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.454317e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.767111e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.767111e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.388549e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.619875e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.619875e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.955150 sec -INFO: No Floating Point Exceptions have been reported - 6,701,822,130 cycles # 2.263 GHz - 12,886,633,037 instructions # 1.92 insn per cycle - 2.963931226 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1669) (512y: 16) (512z: 1427) +TOTAL : 3.016137 sec + 6,649,228,149 cycles # 2.204 GHz + 12,869,205,720 instructions # 1.94 insn per cycle + 3.020818387 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1672) (512y: 5) (512z: 1432) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052564145764E-002 -Relative difference = 1.9988585667912256e-07 +Avg ME (F77/C++) = 1.2828052575059701E-002 +Relative difference = 2.0073664354238512e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt index 1fabc46555..38262df32b 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,236 +10,213 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:27:46 +DATE: 2025-10-11_16:17:26 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.309386e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.516838e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.621181e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.225159e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.730992e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.784746e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.581015 sec -INFO: No Floating Point Exceptions have been reported - 2,337,717,863 cycles # 2.893 GHz - 3,666,959,770 instructions # 1.57 insn per cycle - 0.866189287 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 109 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.586772 sec + 2,390,848,405 cycles # 2.830 GHz + 3,635,852,069 instructions # 1.52 insn per cycle + 0.901933192 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 76 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/GPU) = 1.2828112132410752E-002 +Relative difference = 7.1821224749348815e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.617887e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.109367e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.109367e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.580341e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.051291e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.051291e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 4.275933 sec -INFO: No Floating Point Exceptions have been reported - 12,412,341,686 cycles # 2.900 GHz - 32,352,281,163 instructions # 2.61 insn per cycle - 4.283041784 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 290) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.360853 sec + 12,448,339,745 cycles # 2.853 GHz + 32,675,928,488 instructions # 2.62 insn per cycle + 4.365774305 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 289) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating 
Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039840314887E-002 -Relative difference = 1.244813035273009e-08 +Avg ME (F77/C++) = 1.2828039845771855E-002 +Relative difference = 1.2022736589486635e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.642717e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.471061e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.471061e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.653591e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.483795e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.483795e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.775228 sec -INFO: No Floating Point Exceptions have been reported - 8,161,861,180 cycles # 2.934 GHz - 18,732,698,985 instructions # 2.30 insn per cycle - 2.782796507 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1534) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.750086 sec + 7,984,215,270 cycles # 2.899 GHz + 18,676,669,518 instructions # 2.34 insn per cycle + 2.755384632 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1518) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039283704129E-002 -Relative difference = 5.583829420356249e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.771950e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.635210e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.635210e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.732255e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.524982e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.524982e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.653741 sec -INFO: No Floating Point Exceptions have been reported - 7,565,022,779 cycles # 2.844 GHz - 14,293,093,213 instructions # 1.89 insn per cycle - 2.661141426 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2234) (512y: 0) (512z: 0) +TOTAL : 2.676787 sec + 7,485,834,946 cycles # 2.792 GHz + 14,289,880,775 instructions # 1.91 insn per cycle + 2.681721539 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2235) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053244447801E-002 -Relative difference = 2.5291823782248813e-07 +Avg ME (F77/C++) = 1.2828053277189611E-002 +Relative difference = 2.5547059841227576e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.799741e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.762487e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.762487e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.815938e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.713073e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.713073e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.634363 sec -INFO: No Floating Point Exceptions have been reported - 7,504,285,407 cycles # 2.842 GHz - 13,994,355,792 instructions # 1.86 insn per cycle - 2.641913370 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2087) (512y: 3) (512z: 0) +TOTAL : 2.610308 sec + 7,285,805,876 cycles # 2.787 GHz + 14,002,821,074 instructions # 1.92 insn per cycle + 2.615329640 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2090) (512y: 3) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053244447801E-002 -Relative difference = 2.5291823782248813e-07 +Avg ME (F77/C++) = 1.2828053277189611E-002 +Relative difference = 2.5547059841227576e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.507958e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.890935e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.890935e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.445558e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.751827e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.751827e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.900923 sec -INFO: No Floating Point Exceptions have been reported - 6,641,718,947 cycles # 2.284 GHz - 13,481,348,782 instructions # 2.03 insn per cycle - 2.908502130 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2073) (512y: 1) (512z: 1201) +TOTAL : 2.952535 sec + 6,541,372,214 cycles # 2.212 GHz + 13,442,784,339 instructions # 2.06 insn per cycle + 2.957547644 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2077) (512y: 0) (512z: 1195) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052562326775E-002 -Relative difference = 1.997440588685788e-07 +Avg ME (F77/C++) = 1.2828052571421722E-002 +Relative difference = 2.004530479212976e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt index ddc690e546..47c3a6f771 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:28:10 +DATE: 2025-10-11_16:17:52 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.311525e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.893939e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.130206e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.230358e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.785974e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.903505e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.580982 sec -INFO: No Floating Point Exceptions have been reported - 2,326,498,884 cycles # 2.887 GHz - 3,595,400,053 instructions # 1.55 insn per cycle - 0.865243472 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 79 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.585637 sec + 2,395,685,093 cycles # 2.840 GHz + 3,632,202,579 instructions # 1.52 insn per cycle + 0.900792937 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 72 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/GPU) = 1.2828112132410752E-002 +Relative difference = 7.1821224749348815e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.199736e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.210916e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.210916e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.167434e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.153946e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.153946e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.247253 sec -INFO: No Floating Point Exceptions have been reported - 9,460,485,661 cycles # 2.907 GHz - 25,749,028,052 instructions # 2.72 insn per cycle - 3.254869601 seconds time elapsed +TOTAL : 3.280436 sec + 9,351,045,236 cycles # 2.847 GHz + 25,523,046,940 instructions # 2.73 insn per cycle + 3.285902426 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 243) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, 
FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039838495897E-002 -Relative difference = 1.2589928273811243e-08 +Avg ME (F77/C++) = 1.2828039845771855E-002 +Relative difference = 1.2022736589486635e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.982142e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.480555e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.480555e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.975132e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.504192e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.504192e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.498717 sec -INFO: No Floating Point Exceptions have been reported - 7,385,528,393 cycles # 2.949 GHz - 16,812,365,380 instructions # 2.28 insn per cycle - 2.506313604 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1311) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.494622 sec + 7,225,776,791 cycles # 2.892 GHz + 16,897,519,367 instructions # 2.34 insn per cycle + 2.499894449 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1334) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.917887e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.065921e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.065921e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.863069e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.858307e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.858307e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.542096 sec -INFO: No Floating Point Exceptions have been reported - 7,260,793,625 cycles # 2.848 GHz - 13,703,433,227 instructions # 1.89 insn per cycle - 2.549878549 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2067) (512y: 0) (512z: 0) +TOTAL : 2.571321 sec + 7,197,624,768 cycles # 2.795 GHz + 13,687,331,488 instructions # 1.90 insn per cycle + 2.576243151 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2063) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053220800939E-002 Relative difference = 2.5107486628541925e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.947392e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.166768e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.166768e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.912761e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.069621e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.069621e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.537410 sec -INFO: No Floating Point Exceptions have been reported - 7,253,478,894 cycles # 2.851 GHz - 13,505,585,795 instructions # 1.86 insn per cycle - 2.545044336 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1935) (512y: 7) (512z: 0) +TOTAL : 2.533153 sec + 7,100,141,299 cycles # 2.799 GHz + 13,497,970,451 instructions # 1.90 insn per cycle + 2.538056554 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1946) (512y: 3) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
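[editorial note] The perf counters quoted after each TOTAL line are related by simple ratios: the "GHz" annotation is approximately cycles divided by elapsed time (perf actually normalizes by task-clock, so this is only approximate) and "insn per cycle" is instructions divided by cycles. A quick sketch using the build.512y_f_inl1_hrd1 numbers above:

#include <cstdio>

int main()
{
  const double cycles = 7100141299.0;        // from the build.512y_f_inl1_hrd1 run above
  const double instructions = 13497970451.0;
  const double elapsedSec = 2.538056554;     // approximation: perf divides by task-clock, not wall time
  std::printf( "~%.3f GHz\n", cycles / elapsedSec / 1e9 );        // ~2.80 GHz (the log reports 2.799)
  std::printf( "~%.2f insn per cycle\n", instructions / cycles ); // ~1.90
  return 0;
}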
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053220800939E-002 Relative difference = 2.5107486628541925e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.612725e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.139660e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.139660e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.512964e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.923122e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.923122e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.798296 sec -INFO: No Floating Point Exceptions have been reported - 6,447,529,861 cycles # 2.298 GHz - 13,215,855,857 instructions # 2.05 insn per cycle - 2.806480502 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2039) (512y: 2) (512z: 1081) +TOTAL : 2.885451 sec + 6,375,003,514 cycles # 2.206 GHz + 13,181,689,692 instructions # 2.07 insn per cycle + 2.890749023 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2031) (512y: 1) (512z: 1091) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
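[editorial note] The "DEBUG: MEK (channelid array)" lines report how the 512 test events are distributed over the diagram-enhancement channels, here 256 events on channel 1 and 256 on channel 2. A minimal sketch of how such a summary could be built (an assumption about the bookkeeping, not the MatrixElementKernel code itself):

#include <cstdio>
#include <map>
#include <vector>

int main()
{
  const int nevt = 512, nchannels = 2;
  std::vector<unsigned int> channelIds( nevt );
  for( int ievt = 0; ievt < nevt; ievt++ )
    channelIds[ievt] = 1 + ievt / ( nevt / nchannels ); // first half -> channel 1, second half -> channel 2
  std::map<unsigned int, int> counts;                   // ordered map: channel -> event count
  for( auto c : channelIds ) counts[c]++;
  for( const auto& [channel, n] : counts )
    std::printf( "{ %u : %d }\n", channel, n );         // prints { 1 : 256 } and { 2 : 256 }
  return 0;
}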
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052536860923E-002 Relative difference = 1.977588895209662e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.scaling new file mode 100644 index 0000000000..78116e7085 --- /dev/null +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +DATE: 2025-10-11_15:39:57 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +2.811025e+06 1 256 +5.675268e+06 2 256 +1.125473e+07 4 256 +2.237542e+07 8 256 +4.084889e+07 16 256 +8.038307e+07 32 256 +1.408431e+08 64 256 +2.087041e+08 128 256 +2.617085e+08 256 256 +3.164102e+08 512 256 +3.490720e+08 1024 256 +### GPU: scaling test 32 +3.990821e+05 1 32 +7.057552e+05 2 32 +1.416039e+06 4 32 +2.964129e+06 8 32 +5.593795e+06 16 32 +1.165053e+07 32 32 +2.163693e+07 64 32 +4.137165e+07 128 32 +7.520702e+07 256 32 +1.314590e+08 512 32 +1.948562e+08 1024 32 +2.786288e+08 2048 32 +3.116503e+08 4096 32 +3.644493e+08 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.058031e+06 1 256 +1.064708e+06 2 256 +1.091924e+06 4 256 +### CPU: scaling test 32 +9.653674e+05 1 32 +1.073826e+06 2 32 +1.086320e+06 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.851906e+06 1 256 +1.832695e+06 2 256 +1.916161e+06 4 256 +### CPU: scaling test 32 +1.906351e+06 1 32 +1.246470e+06 2 32 +1.664802e+06 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.709626e+06 1 256 +2.644942e+06 2 256 +2.445350e+06 4 256 +### CPU: scaling test 32 +2.186539e+06 1 32 +2.363281e+06 2 32 +2.641954e+06 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.767179e+06 1 256 +2.686691e+06 2 256 +2.759654e+06 4 256 +### CPU: scaling test 32 +1.340876e+06 1 32 +2.416645e+06 2 32 +2.506708e+06 4 32 +========================================================================= +scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.171313e+06 1 256 +2.276072e+06 2 256 +2.282286e+06 4 256 +### CPU: scaling test 32 +1.265823e+06 1 32 +1.671673e+06 2 32 +2.039028e+06 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 8e00f9820d..caf7cf3a58 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
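[editorial note] The new .scaling log above sweeps the grid size at fixed threads-per-block (256 or 32), doubling the number of blocks each time and printing one line per grid size with the measured throughput, the block count and the thread count. A sketch of that measurement loop under stated assumptions (computeMEs and niter are hypothetical stand-ins for the real check.exe internals):

#include <chrono>
#include <cstdio>

// Hypothetical stand-in for one iteration of the ME calculation on a
// (blocks x threads) grid of events; check.exe runs this on GPU or CPU.
void computeMEs( int blocks, int threads ) { /* ... */ }

int main()
{
  const int threads = 256, niter = 10; // niter is an assumption
  for( int blocks = 1; blocks <= 1024; blocks *= 2 )
  {
    const auto t0 = std::chrono::steady_clock::now();
    for( int i = 0; i < niter; i++ ) computeMEs( blocks, threads );
    const std::chrono::duration<double> dt = std::chrono::steady_clock::now() - t0;
    const double throughput = double( blocks ) * threads * niter / dt.count(); // events per second
    std::printf( "%e %d %d\n", throughput, blocks, threads ); // same three columns as the tables above
  }
  return 0;
}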
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_08:55:54 +DATE: 2025-10-11_15:14:54 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.055673e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.658424e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.851508e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.254014e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.994980e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.902542e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.678781 sec -INFO: No Floating Point Exceptions have been reported - 2,628,768,348 cycles # 2.876 GHz - 4,103,389,790 instructions # 1.56 insn per cycle - 1.044225431 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.693324 sec + 2,725,071,311 cycles # 2.836 GHz + 4,080,796,637 instructions # 1.50 insn per cycle + 1.023122717 seconds time elapsed +......................................................................... 
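[editorial note] These logs now print two runtime variables, CUDACPP_RUNTIME_BLASCOLORSUM and CUDACPP_RUNTIME_CUBLASTF32TENSOR, both empty in this run. Assuming they act as simple on/off environment toggles (their exact semantics are not spelled out in the log), reading them would look like:

#include <cstdio>
#include <cstdlib>

// Assumption: an unset or empty variable leaves the default behaviour,
// a non-empty value enables the corresponding option.
static bool envFlagIsSet( const char* name )
{
  const char* value = std::getenv( name );
  return value != nullptr && value[0] != '\0';
}

int main()
{
  const bool useBlasColorSum = envFlagIsSet( "CUDACPP_RUNTIME_BLASCOLORSUM" );
  const bool useTf32Tensor = envFlagIsSet( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" );
  std::printf( "BLAS color sum: %s, cuBLAS TF32 tensor ops: %s\n",
               useBlasColorSum ? "on" : "off", useTf32Tensor ? "on" : "off" );
  return 0;
}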
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 -Avg ME (F77/GPU) = 1.2828039901590279E-002 -Relative difference = 7.671454200650844e-09 +Avg ME (F77/GPU) = 1.2828039945363461E-002 +Relative difference = 4.259149494690016e-09 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.011376e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.175905e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) 
= ( 1.175905e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.004559e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.167053e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.167053e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.671756 sec -INFO: No Floating Point Exceptions have been reported - 19,661,999,702 cycles # 2.943 GHz - 46,395,546,050 instructions # 2.36 insn per cycle - 6.683261433 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 466) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.681187 sec + 19,310,569,163 cycles # 2.888 GHz + 46,561,074,047 instructions # 2.41 insn per cycle + 6.686779372 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 482) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.631538e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.161697e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.161697e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.592071e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.095366e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.095366e+06 ) sec^-1 
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.308846 sec -INFO: No Floating Point Exceptions have been reported - 12,713,127,116 cycles # 2.944 GHz - 31,571,564,120 instructions # 2.48 insn per cycle - 4.322869208 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1731) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.374152 sec + 12,572,513,674 cycles # 2.872 GHz + 31,463,286,168 instructions # 2.50 insn per cycle + 4.379862583 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1723) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.963768e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.746755e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.746755e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.938324e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.700921e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.700921e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.649356 sec -INFO: No Floating Point Exceptions have been reported - 10,294,572,937 cycles # 2.814 GHz - 
19,586,622,017 instructions # 1.90 insn per cycle - 3.662289672 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2045) (512y: 0) (512z: 0) +TOTAL : 3.662440 sec + 10,121,778,715 cycles # 2.760 GHz + 19,471,159,122 instructions # 1.92 insn per cycle + 3.668260640 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2032) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.001856e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.818080e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.818080e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.971771e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.738449e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.738449e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.589952 sec -INFO: No Floating Point Exceptions have been reported - 10,108,826,304 cycles # 2.808 GHz - 19,396,692,714 instructions # 1.92 insn per cycle - 3.602641354 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1799) (512y: 188) (512z: 0) +TOTAL : 3.605464 sec + 
9,883,989,440 cycles # 2.738 GHz + 19,284,997,724 instructions # 1.95 insn per cycle + 3.611144081 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1786) (512y: 191) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.801777e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.420597e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.420597e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.763507e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.351410e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.351410e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.939332 sec -INFO: No Floating Point Exceptions have been reported - 8,555,878,739 cycles # 2.167 GHz - 15,216,666,169 instructions # 1.78 insn per cycle - 3.951287451 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 966) (512y: 154) (512z: 1330) +TOTAL : 3.983402 sec + 8,347,852,448 cycles # 2.093 GHz + 14,994,758,047 instructions # 1.80 insn per cycle + 3.989072483 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 952) (512y: 
154) (512z: 1313) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt index 0283d4438d..f781dc1bb5 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
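[editorial note] If CUDACPP_RUNTIME_CUBLASTF32TENSOR were enabled, one plausible implementation (an assumption, not necessarily what the plugin does) is to switch the cuBLAS handle used for a BLAS-based color sum to TF32 tensor-core math for its single-precision GEMMs:

#include <cstdio>
#include <cublas_v2.h>

int main() // build with nvcc and link against -lcublas
{
  cublasHandle_t handle;
  if( cublasCreate( &handle ) != CUBLAS_STATUS_SUCCESS ) return 1;
  // Ask cuBLAS to use TF32 tensor cores for FP32 GEMMs (effective on Ampere and later GPUs)
  const cublasStatus_t status = cublasSetMathMode( handle, CUBLAS_TF32_TENSOR_OP_MATH );
  std::printf( "TF32 tensor-op math %s\n", status == CUBLAS_STATUS_SUCCESS ? "enabled" : "not enabled" );
  cublasDestroy( handle );
  return 0;
}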
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_08:56:25 +DATE: 2025-10-11_15:15:31 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.048170e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.671940e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.867900e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.263252e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.017320e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.920339e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.677955 sec -INFO: No Floating Point Exceptions have been reported - 2,610,429,449 cycles # 2.847 GHz - 4,074,904,816 instructions # 1.56 insn per cycle - 1.028610198 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.689357 sec + 2,740,273,431 cycles # 2.852 GHz + 4,084,188,832 instructions # 1.49 insn per cycle + 1.021206637 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 130 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
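[editorial note] The ncu lines just above report launch__registers_per_thread for the two kernels now profiled separately (130 for calculate_jamps and 16 for color_sum_kernel in this build). Register pressure mainly matters through occupancy; a minimal CUDA sketch of querying how many blocks of a given size fit per SM for a compiled kernel (someKernel is a placeholder, not a plugin kernel):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void someKernel() {} // placeholder for the kernels profiled above

int main()
{
  int maxBlocksPerSm = 0;
  const int blockSize = 256;
  // The runtime accounts for the kernel's register and shared-memory usage
  cudaOccupancyMaxActiveBlocksPerMultiprocessor( &maxBlocksPerSm, someKernel, blockSize, 0 /* dynamic shared mem */ );
  std::printf( "max active blocks per SM at %d threads/block: %d\n", blockSize, maxBlocksPerSm );
  return cudaPeekAtLastError() != cudaSuccess;
}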
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 -Avg ME (F77/GPU) = 1.2828039901590279E-002 -Relative difference = 7.671454200650844e-09 +Avg ME (F77/GPU) = 1.2828039945363461E-002 +Relative difference = 4.259149494690016e-09 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.012794e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.178467e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.178467e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.004380e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.167437e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.167437e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.662178 sec -INFO: No Floating Point Exceptions have been reported - 19,608,707,308 cycles # 2.939 GHz - 46,331,953,932 instructions # 2.36 insn per cycle - 6.674225175 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 453) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.681530 sec + 19,329,038,472 cycles # 2.891 GHz + 46,534,784,670 instructions # 2.41 insn per cycle + 6.687165929 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 474) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point 
Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.631371e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.156116e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.156116e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.608782e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.123511e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.123511e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.305744 sec -INFO: No Floating Point Exceptions have been reported - 12,687,194,497 cycles # 2.940 GHz - 31,570,654,619 instructions # 2.49 insn per cycle - 4.317357131 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1724) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.330389 sec + 12,526,304,265 cycles # 2.890 GHz + 31,429,125,016 instructions # 2.51 insn per cycle + 4.336065673 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.951503e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.723168e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.723168e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.942808e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.702933e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.702933e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.669508 sec -INFO: No Floating Point Exceptions have been reported - 10,337,023,986 cycles # 2.809 GHz - 19,600,398,756 instructions # 1.90 insn per cycle - 3.680210311 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2036) (512y: 0) (512z: 0) +TOTAL : 3.652389 sec + 10,126,359,115 cycles # 2.769 GHz + 19,454,993,368 instructions # 1.92 insn per cycle + 3.658235344 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2019) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.000628e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.813640e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.813640e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.957600e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.738598e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.738598e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.591164 sec -INFO: No Floating Point Exceptions have been reported - 10,093,463,938 cycles # 2.804 GHz - 19,298,137,282 instructions # 1.91 insn per cycle - 3.601580555 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1766) (512y: 191) (512z: 0) +TOTAL : 3.629719 sec + 9,979,298,276 cycles # 2.746 GHz + 19,273,169,438 instructions # 1.93 insn per cycle + 3.635438116 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1773) (512y: 191) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.833398e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.483164e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.483164e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.800984e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.418771e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.418771e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.878021 sec -INFO: No Floating Point Exceptions have been reported - 8,399,559,009 cycles # 2.161 GHz - 15,073,176,103 instructions # 1.79 insn per cycle - 3.888708235 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 959) (512y: 155) (512z: 1296) +TOTAL : 3.911829 sec + 8,199,622,084 cycles # 2.094 GHz + 14,847,008,944 instructions # 1.81 insn per cycle + 3.917306895 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 941) (512y: 155) (512z: 1281) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.scaling new file mode 100644 index 0000000000..4703fd43b7 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_15:40:39 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +1.383253e+06 1 256 +2.893064e+06 2 256 +5.376118e+06 4 256 +1.185151e+07 8 256 +2.346081e+07 16 256 +4.511286e+07 32 256 +5.630221e+07 64 256 +6.196121e+07 128 256 +6.780047e+07 256 256 +7.309787e+07 512 256 +7.376814e+07 1024 256 +### GPU: scaling test 32 +1.722124e+05 1 32 +3.905487e+05 2 32 +6.832898e+05 4 32 +1.517739e+06 8 32 +2.835858e+06 16 32 +6.130048e+06 32 32 +1.120344e+07 64 32 +2.084478e+07 128 32 +4.106718e+07 256 32 +5.763008e+07 512 32 +6.090072e+07 1024 32 +6.706632e+07 2048 32 +7.231618e+07 4096 32 +7.501823e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.767984e+05 1 256 +1.796605e+05 2 256 +1.802476e+05 4 256 +### CPU: scaling test 32 +1.472612e+05 1 32 +1.715919e+05 2 32 +1.711413e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.982512e+05 1 256 +3.086531e+05 2 256 +3.162558e+05 4 256 +### CPU: scaling test 32 +2.995750e+05 1 32 +2.938112e+05 2 32 +2.996907e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.811704e+05 1 256 +4.983434e+05 2 256 +5.240082e+05 4 256 +### CPU: scaling test 32 +4.296686e+05 1 32 +4.897722e+05 2 32 +4.790509e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.039122e+05 1 256 +5.537973e+05 2 256 +5.292318e+05 4 256 +### CPU: scaling test 32 +5.049628e+05 1 32 +5.163039e+05 2 32 +5.558813e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 
256 +3.352738e+05 1 256 +3.531052e+05 2 256 +3.524363e+05 4 256 +### CPU: scaling test 32 +3.508580e+05 1 32 +3.508926e+05 2 32 +3.509426e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 0abecbd859..b83fe948f8 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_08:57:50 +DATE: 2025-10-11_15:17:08 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.424562e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.378226e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.000814e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.814869e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.187282e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.582493e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.532719 sec -INFO: No Floating Point Exceptions have been reported - 2,198,564,055 cycles # 2.860 GHz - 3,137,529,593 instructions # 1.43 insn per cycle - 0.850854779 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.541191 sec + 2,309,968,372 cycles # 2.848 GHz + 3,226,495,089 instructions # 1.40 insn per cycle + 0.869698260 seconds time elapsed +......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.821542e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.869016e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.869016e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.792870e+05 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 1.839272e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.839272e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.902388 sec -INFO: No Floating Point Exceptions have been reported - 17,373,663,633 cycles # 2.939 GHz - 46,051,346,456 instructions # 2.65 insn per cycle - 5.916149203 seconds time elapsed +TOTAL : 5.956913 sec + 17,261,214,247 cycles # 2.896 GHz + 46,320,121,297 instructions # 2.68 insn per cycle + 5.962421755 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.199984e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.364044e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.364044e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.087487e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.238823e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.238823e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.425369 sec -INFO: No Floating Point Exceptions have been reported - 10,116,123,100 cycles # 2.945 GHz - 27,968,506,728 instructions # 2.76 insn 
per cycle - 3.436971917 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2536) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.506189 sec + 10,088,639,728 cycles # 2.873 GHz + 27,919,288,717 instructions # 2.77 insn per cycle + 3.512045055 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.021241e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.422127e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.422127e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.914379e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.288444e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.288444e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.236686 sec -INFO: No Floating Point Exceptions have been reported - 6,226,726,050 cycles # 2.773 GHz - 12,700,169,832 instructions # 2.04 insn per cycle - 2.249020906 seconds time elapsed +TOTAL : 2.241997 sec + 6,102,243,675 cycles # 2.716 GHz + 12,609,784,840 instructions # 2.07 insn per cycle + 2.247857659 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) 
(512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.518459e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.996461e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.996461e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.130809e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.541182e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.541182e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.046768 sec -INFO: No Floating Point Exceptions have been reported - 5,709,909,658 cycles # 2.777 GHz - 12,140,194,379 instructions # 2.13 insn per cycle - 2.059786524 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2360) (512y: 144) (512z: 0) +TOTAL : 2.151754 sec + 5,849,443,539 cycles # 2.712 GHz + 12,186,163,621 instructions # 2.08 insn per cycle + 2.157524773 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating 
Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.403513e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.583329e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.583329e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.453655e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.631223e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.631223e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.229693 sec -INFO: No Floating Point Exceptions have been reported - 6,051,702,488 cycles # 1.869 GHz - 8,428,750,265 instructions # 1.39 insn per cycle - 3.242969033 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1441) (512y: 122) (512z: 1802) +TOTAL : 3.144840 sec + 5,734,260,839 cycles # 1.821 GHz + 8,277,135,516 instructions # 1.44 insn per cycle + 3.150611128 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 
tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.scaling new file mode 100644 index 0000000000..28ed30edba --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_15:54:51 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +4.305698e+05 1 256 +8.421080e+05 2 256 +1.658112e+06 4 256 +2.989838e+06 8 256 +4.972377e+06 16 256 +7.105357e+06 32 256 +9.196651e+06 64 256 +1.028995e+07 128 256 +1.118682e+07 256 256 +1.170520e+07 512 256 +1.194760e+07 1024 256 +### GPU: scaling test 32 +5.803167e+04 1 32 +1.141868e+05 2 32 +2.280709e+05 4 32 +4.392090e+05 8 32 +8.271820e+05 16 32 +1.628245e+06 32 32 +3.150764e+06 64 32 +5.031576e+06 128 32 +7.100399e+06 256 32 +9.298129e+06 512 32 +1.037459e+07 1024 32 +1.113939e+07 2048 32 +1.172028e+07 4096 32 +1.198120e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.715304e+05 1 256 +1.781417e+05 2 256 +1.794714e+05 4 256 +### CPU: scaling test 32 +1.577069e+05 1 32 +1.683648e+05 2 32 +1.674260e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.985670e+05 1 256 +3.075757e+05 2 256 +3.131579e+05 4 256 +### CPU: scaling test 32 +2.725469e+05 1 32 +2.816294e+05 2 32 +2.958942e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.247762e+05 1 256 +5.241155e+05 2 256 +4.852917e+05 4 256 +### CPU: scaling test 32 +5.186974e+05 1 32 +5.291399e+05 2 32 +5.305920e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.514805e+05 1 256 +5.505359e+05 2 256 +5.563984e+05 4 256 +### CPU: scaling test 32 +5.060969e+05 1 32 +5.545783e+05 2 32 +4.913100e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 
256 +3.339783e+05 1 256 +3.535899e+05 2 256 +3.481939e+05 4 256 +### CPU: scaling test 32 +3.145334e+05 1 32 +3.563455e+05 2 32 +3.387686e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.txt new file mode 100644 index 0000000000..898eec66e3 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.txt @@ -0,0 +1,223 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_15:50:32 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.041344e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.200767e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.210879e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.316417 sec + 4,841,050,091 cycles # 2.845 GHz + 6,855,412,132 instructions # 1.42 insn per cycle + 1.762497593 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028807e+00 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.782393e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.828671e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.828671e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 5.991425 sec + 17,268,124,515 cycles # 2.880 GHz + 46,321,023,545 instructions # 2.68 insn per cycle + 5.996950400 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388515649 +Relative difference = 3.258803992249869e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.120284e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.273768e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.273768e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.468964 sec + 10,062,208,508 cycles # 2.897 GHz + 27,919,768,700 instructions # 2.77 insn per cycle + 3.474512429 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 2.028807e+00
+Avg ME (F77/C++) = 2.0288063388515654
+Relative difference = 3.2588039900609506e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
+EvtsPerSec[Rmb+ME] (23) = ( 4.922035e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.300092e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.300092e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
+TOTAL : 2.238317 sec
+ 6,090,888,500 cycles # 2.716 GHz
+ 12,608,791,480 instructions # 2.07 insn per cycle
+ 2.243747530 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 2.028807e+00
+Avg ME (F77/C++) = 2.0288063388516204
+Relative difference = 3.2588037186351226e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
+EvtsPerSec[Rmb+ME] (23) = ( 5.153909e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.564898e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.564898e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
+TOTAL : 2.141769 sec
+ 5,839,015,371 cycles # 2.721 GHz
+ 12,183,200,067 instructions # 2.09 insn per cycle
+ 2.147164385 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 2.028807e+00
+Avg ME (F77/C++) = 2.0288063388516204
+Relative difference = 3.2588037186351226e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
+EvtsPerSec[Rmb+ME] (23) = ( 3.421281e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.595508e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.595508e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
+TOTAL : 3.172923 sec
+ 5,704,193,065 cycles # 1.795 GHz
+ 8,277,048,290 instructions # 1.45 insn per cycle
+ 3.178502846 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt index 0a62f31f21..8fbb21e9ff 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,252 +10,216 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:37:36 +DATE: 2025-10-11_16:28:38 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.523249e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.008578e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.008578e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.427555e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.769300e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.769300e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.943118 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 3,438,006,415 cycles # 2.887 GHz - 4,812,518,572 instructions # 1.40 insn per cycle - 1.248014993 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge +TOTAL : 0.828718 sec + 3,186,820,693 cycles # 2.852 GHz + 4,808,126,394 instructions # 1.51 insn per cycle + 1.176249753 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.806787e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.852935e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.852935e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.774052e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.819717e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.819717e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 6.028463 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 17,701,660,691 cycles # 2.931 GHz - 46,100,592,443 instructions # 2.60 insn per cycle - 6.041454793 seconds time elapsed +TOTAL : 6.098613 sec + 17,597,864,140 cycles # 2.883 GHz + 46,380,415,047 instructions # 2.64 insn per cycle + 6.105859903 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.171570e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.328412e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.328412e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.088043e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.238153e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.238153e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.537488 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 10,436,410,766 cycles # 2.940 GHz - 28,150,415,987 instructions # 2.70 insn per cycle - 3.550700440 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2536) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.585879 sec + 10,400,318,731 cycles # 2.896 GHz + 28,093,070,719 instructions # 2.70 insn per cycle + 3.593178065 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.940586e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.316252e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.316252e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.807610e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.170791e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.170791e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.355700 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 6,586,554,223 cycles # 2.781 GHz - 12,999,619,553 instructions # 1.97 insn per cycle - 2.369192751 seconds time elapsed +TOTAL : 2.371916 sec + 6,428,829,911 cycles # 2.703 GHz + 12,887,812,684 instructions # 2.00 insn per cycle + 2.379156266 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.425137e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.877080e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.877080e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.017593e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.406809e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.406809e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.160954 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 6,058,497,746 cycles # 2.788 GHz - 12,422,408,910 instructions # 2.05 insn per cycle - 2.174009213 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2360) (512y: 144) (512z: 0) +TOTAL : 2.281231 sec + 6,165,327,004 cycles # 2.695 GHz + 12,463,334,301 instructions # 2.02 insn per cycle + 2.288346369 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.454260e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.633384e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.633384e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.356453e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.524615e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.524615e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.271770 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 6,220,081,356 cycles # 1.894 GHz - 8,655,636,644 instructions # 1.39 insn per cycle - 3.285127387 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1441) (512y: 122) (512z: 1802) +TOTAL : 3.315612 sec + 6,121,266,749 cycles # 1.843 GHz + 8,516,898,541 instructions # 1.39 insn per cycle + 3.322530830 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt index 70d02af695..26e0f25894 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:49:47 +DATE: 2025-10-11_16:44:00 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.202403e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.187841e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.877468e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.725056e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.186541e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.580567e+07 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 0.637594 sec -INFO: No Floating Point Exceptions have been reported - 2,481,390,363 cycles # 2.852 GHz - 3,619,998,982 instructions # 1.46 insn per cycle - 0.928734017 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.638610 sec + 2,571,549,393 cycles # 2.847 GHz + 3,659,796,797 instructions # 1.42 insn per cycle + 0.960427498 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.808108e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.854363e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.854363e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.781185e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.826305e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.826305e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 5.981515 sec -INFO: No Floating Point Exceptions have been reported - 17,441,882,337 cycles # 2.914 GHz - 45,980,812,555 instructions # 2.64 insn per cycle - 5.987317462 seconds time elapsed +TOTAL : 6.057966 sec + 17,438,379,118 cycles # 2.877 GHz + 46,337,653,518 instructions # 2.66 insn per cycle + 6.063608366 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE 
(NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.173867e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.332553e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.332553e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.115210e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.268081e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.268081e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.490197 sec -INFO: No Floating Point Exceptions have been reported - 10,215,611,800 cycles # 2.923 GHz - 27,889,324,001 instructions # 2.73 insn per cycle - 3.495993800 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2536) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.536392 sec + 10,229,702,343 cycles # 2.889 GHz + 27,918,943,570 instructions # 2.73 insn per cycle + 3.542208033 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.999819e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.389873e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] 
(3a) = ( 5.389873e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.877271e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.247954e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.247954e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.281339 sec -INFO: No Floating Point Exceptions have been reported - 6,287,168,374 cycles # 2.750 GHz - 12,602,929,813 instructions # 2.00 insn per cycle - 2.287435325 seconds time elapsed +TOTAL : 2.320644 sec + 6,288,847,916 cycles # 2.704 GHz + 12,592,903,872 instructions # 2.00 insn per cycle + 2.326302778 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.471434e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.936245e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.936245e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.123817e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.531393e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.531393e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.097327 sec -INFO: No Floating Point Exceptions 
have been reported - 5,814,420,150 cycles # 2.765 GHz - 11,994,829,914 instructions # 2.06 insn per cycle - 2.103345298 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2360) (512y: 144) (512z: 0) +TOTAL : 2.218321 sec + 6,014,515,797 cycles # 2.706 GHz + 12,133,309,602 instructions # 2.02 insn per cycle + 2.224085333 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.462865e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.641783e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.641783e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.381723e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.553268e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.553268e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.218108 sec -INFO: No Floating Point Exceptions have been reported - 5,937,437,503 cycles # 1.843 GHz - 8,290,568,638 instructions # 1.40 insn per cycle - 3.224462086 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1441) (512y: 122) (512z: 1802) +TOTAL : 
3.273257 sec + 5,933,511,412 cycles # 1.811 GHz + 8,229,034,215 instructions # 1.39 insn per cycle + 3.278919832 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt index 794a3c9310..4d5855b54d 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:46:56 +DATE: 2025-10-11_16:40:27 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.311257e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.342288e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.004457e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.767730e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.205228e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.589097e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.566605 sec -INFO: No Floating Point Exceptions have been reported - 2,313,605,054 cycles # 2.893 GHz - 3,600,350,267 instructions # 1.56 insn per cycle - 0.856648834 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.575649 sec + 2,386,111,811 cycles # 2.845 GHz + 3,639,741,256 instructions # 1.53 insn per cycle + 0.895952286 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.824387e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.871256e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.871256e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.791051e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.837013e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.837013e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.855309 sec -INFO: No Floating Point Exceptions have been reported - 17,230,682,954 cycles # 2.940 GHz - 45,932,528,772 instructions # 2.67 insn per cycle - 5.861424268 seconds time elapsed +TOTAL : 5.963163 sec + 17,264,643,304 cycles # 2.893 GHz + 46,321,097,140 instructions # 2.68 insn per cycle + 5.968989618 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.215073e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.378302e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.378302e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.101295e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.253753e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.253753e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.372268 sec -INFO: No Floating Point Exceptions have been reported - 9,959,367,668 cycles # 2.949 GHz - 27,848,270,798 instructions # 2.80 insn per cycle - 3.378265573 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2536) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.491410 sec + 10,059,054,482 cycles # 2.877 GHz + 27,919,466,540 instructions # 2.78 insn per cycle + 3.497008176 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.999546e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.391220e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.391220e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.890079e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.263113e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.263113e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.206484 sec -INFO: No Floating Point Exceptions have been reported - 6,113,930,208 cycles # 2.765 GHz - 12,581,849,902 instructions # 2.06 insn per cycle - 2.212402360 seconds time elapsed +TOTAL : 2.254459 sec + 6,084,381,375 cycles # 2.693 GHz + 12,610,002,661 instructions # 2.07 insn per cycle + 2.260263260 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.516180e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.984165e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.984165e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.141713e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.554289e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.554289e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.007710 sec -INFO: No Floating Point Exceptions have been reported - 5,576,628,773 cycles # 2.771 GHz - 12,020,299,868 instructions # 2.16 insn per cycle - 2.013581558 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2360) (512y: 144) (512z: 0) +TOTAL : 2.147865 sec + 5,852,500,330 cycles # 2.720 GHz + 12,186,332,321 instructions # 2.08 insn per cycle + 2.153550767 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.502286e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.687963e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.687963e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.413552e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.588205e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.588205e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.102138 sec -INFO: No Floating Point Exceptions have been reported - 5,751,986,200 cycles # 1.852 GHz - 8,297,969,466 instructions # 1.44 insn per cycle - 3.107697215 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1441) (512y: 122) (512z: 1802) +TOTAL : 3.180124 sec + 5,723,407,148 cycles # 1.797 GHz + 8,277,947,646 instructions # 1.45 insn per cycle + 3.185775207 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_noBlas.txt new file mode 100644 index 0000000000..4b28e0c827 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_noBlas.txt @@ -0,0 +1,223 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasNoBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasNoBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_16:49:10 + +HASBLAS=hasNoBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.755096e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.215389e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.607884e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.539292 sec + 2,216,200,050 cycles # 2.846 GHz + 3,157,615,309 instructions # 1.42 insn per cycle + 0.835257331 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028807e+00 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.787183e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.832888e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.832888e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 5.975964 sec + 17,260,345,803 cycles # 2.886 GHz + 46,320,336,029 instructions # 2.68 insn per cycle + 5.981639118 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388515649 +Relative difference = 3.258803992249869e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.111247e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.265577e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.265577e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.479269 sec + 10,044,184,434 cycles # 2.883 GHz + 27,919,122,564 instructions # 2.78 insn per cycle + 3.485095741 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388515654 +Relative difference = 3.2588039900609506e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 4.905590e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.283676e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.283676e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.245986 sec + 6,089,248,282 cycles # 2.705 GHz + 12,609,705,263 instructions # 2.07 insn per cycle + 2.251881277 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 5.148141e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.559740e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.559740e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.144804 sec + 5,824,946,914 cycles # 2.710 GHz + 12,184,657,847 instructions # 2.09 insn per cycle + 2.150527846 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.423895e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.599460e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.599460e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.171890 sec + 5,741,396,850 cycles # 1.808 GHz + 8,278,034,433 instructions # 1.44 insn per cycle + 3.177718293 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt index 70a45db399..e5e06f1218 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,235 +10,213 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:44:10 +DATE: 2025-10-11_16:37:03 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.785807e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.291280e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.973584e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.626435e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.214094e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.587498e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.717756 sec -INFO: No Floating Point Exceptions have been reported - 2,755,914,027 cycles # 2.900 GHz - 4,368,405,962 instructions # 1.59 insn per cycle - 1.007006361 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst +TOTAL : 0.726364 sec + 2,849,514,717 cycles # 2.845 GHz + 4,382,574,758 instructions # 1.54 insn per cycle + 1.057928884 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.829948e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.877608e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.877608e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.789888e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.835303e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.835303e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.839744 sec -INFO: No Floating Point Exceptions have been reported - 17,231,514,699 cycles # 2.948 GHz - 45,931,758,909 instructions # 2.67 insn per cycle - 5.845651027 seconds time elapsed +TOTAL : 5.967334 sec + 17,272,703,409 cycles # 2.893 GHz + 46,321,862,531 instructions # 2.68 insn per cycle + 5.973038452 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.215717e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.376174e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.376174e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.088498e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.238712e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.238712e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.370523 sec -INFO: No Floating Point Exceptions have been reported - 9,939,666,586 cycles # 2.945 GHz - 27,847,302,489 instructions # 2.80 insn per cycle - 3.376515027 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2536) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.504822 sec + 10,065,494,953 cycles # 2.868 GHz + 27,919,546,717 instructions # 2.77 insn per cycle + 3.510554362 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.058902e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.451650e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.451650e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.895401e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.272281e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.272281e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.181386 sec -INFO: No Floating Point Exceptions have been reported - 6,074,037,919 cycles # 2.778 GHz - 12,580,567,087 instructions # 2.07 insn per cycle - 2.187203017 seconds time elapsed +TOTAL : 2.251790 sec + 6,086,448,139 cycles # 2.697 GHz + 12,610,253,243 instructions # 2.07 insn per cycle + 2.257658692 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.484469e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.947491e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.947491e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.104544e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.508827e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.508827e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.020942 sec -INFO: No Floating Point Exceptions have been reported - 5,589,694,694 cycles # 2.759 GHz - 12,020,772,424 instructions # 2.15 insn per cycle - 2.026934215 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2360) (512y: 144) (512z: 0) +TOTAL : 2.163370 sec + 5,848,310,473 cycles # 2.697 GHz + 12,186,147,335 instructions # 2.08 insn per cycle + 2.169166916 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.541083e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.728456e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.728456e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.395329e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.569447e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.569447e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.072814 sec -INFO: No Floating Point Exceptions have been reported - 5,724,538,871 cycles # 1.860 GHz - 8,297,304,281 instructions # 1.45 insn per cycle - 3.079169559 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1441) (512y: 122) (512z: 1802) +TOTAL : 3.198349 sec + 5,734,393,208 cycles # 1.791 GHz + 8,277,908,197 instructions # 1.44 insn per cycle + 3.204254400 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt index 03be4a726d..09986e5034 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_08:58:15 +DATE: 2025-10-11_15:17:41 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.508928e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.321752e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.002344e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.740251e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.070566e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.446622e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.536365 sec -INFO: No Floating Point Exceptions have been reported - 2,214,194,265 cycles # 2.876 GHz - 3,152,115,430 instructions # 1.42 insn per cycle - 0.834564895 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 212 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.542467 sec + 2,308,061,310 cycles # 2.843 GHz + 3,180,365,192 instructions # 1.38 insn per cycle + 0.870299018 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 168 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.855453e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.904405e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.904405e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.832732e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.880113e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.880113e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.800560 sec -INFO: No Floating Point Exceptions have been reported - 16,903,949,090 cycles # 2.909 GHz - 45,043,853,273 instructions # 2.66 insn per cycle - 5.813534817 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 567) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.829901 sec + 16,848,535,293 cycles # 2.888 GHz + 45,296,509,977 instructions # 2.69 insn per cycle + 5.835776505 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 568) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = 
CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.339712e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.518637e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.518637e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.271423e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.440008e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.440008e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.288467 sec -INFO: No Floating Point Exceptions have been reported - 9,645,043,566 cycles # 2.925 GHz - 26,807,862,552 instructions # 2.78 insn per cycle - 3.301069690 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2327) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.314065 sec + 9,572,123,137 cycles # 2.885 GHz + 26,751,815,901 instructions # 2.79 insn per cycle + 3.319563861 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2313) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.590385e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
4.923511e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.923511e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.514184e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.827414e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.827414e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.431911 sec -INFO: No Floating Point Exceptions have been reported - 6,762,097,168 cycles # 2.769 GHz - 14,239,182,198 instructions # 2.11 insn per cycle - 2.443454156 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2711) (512y: 0) (512z: 0) +TOTAL : 2.431404 sec + 6,623,808,841 cycles # 2.719 GHz + 14,177,690,165 instructions # 2.14 insn per cycle + 2.437208264 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2724) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.784038e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.137564e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.137564e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.701345e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.040507e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.040507e+05 ) sec^-1 
MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.339078 sec -INFO: No Floating Point Exceptions have been reported - 6,493,835,738 cycles # 2.765 GHz - 13,835,177,964 instructions # 2.13 insn per cycle - 2.350490634 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2360) (512y: 298) (512z: 0) +TOTAL : 2.338470 sec + 6,401,665,095 cycles # 2.732 GHz + 13,769,940,318 instructions # 2.15 insn per cycle + 2.344318448 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2371) (512y: 297) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.400894e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.576119e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.576119e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.303189e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.466084e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.466084e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.231977 sec -INFO: No Floating Point Exceptions have been reported - 6,054,126,925 cycles # 1.868 GHz - 10,181,313,288 instructions # 1.68 insn per cycle - 3.245420113 seconds 
time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1273) (512y: 208) (512z: 1988) +TOTAL : 3.283375 sec + 5,957,178,129 cycles # 1.812 GHz + 10,086,124,192 instructions # 1.69 insn per cycle + 3.289028880 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1276) (512y: 208) (512z: 1988) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt index f94c1448dd..0d42001848 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,236 +10,213 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:28:32 +DATE: 2025-10-11_16:18:17 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.445619e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.389644e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.998797e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.785771e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.171465e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.568632e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.532166 sec -INFO: No Floating Point Exceptions have been reported - 2,223,705,741 cycles # 2.888 GHz - 3,137,862,648 instructions # 1.41 insn per cycle - 0.826622030 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.539437 sec + 2,324,660,140 cycles # 2.833 GHz + 3,221,828,743 instructions # 1.39 insn per cycle + 0.878217469 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.243473e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.316891e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.316891e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.387107e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.469288e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.469288e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.822701 sec -INFO: No Floating Point Exceptions have been reported - 14,262,425,677 cycles # 2.951 GHz - 34,462,229,045 instructions # 2.42 insn per cycle - 4.834685593 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 665) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.501541 sec + 13,071,399,497 cycles # 2.901 GHz + 34,739,078,110 instructions # 2.66 insn per cycle + 4.507191858 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 648) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, 
FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515649 -Relative difference = 3.258803992249869e-07 +Avg ME (F77/C++) = 2.0288063388515654 +Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.991823e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.134338e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.134338e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.901021e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.033616e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.033616e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.653458 sec -INFO: No Floating Point Exceptions have been reported - 10,828,452,798 cycles # 2.955 GHz - 24,364,594,695 instructions # 2.25 insn per cycle - 3.665357624 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2610) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.723435 sec + 10,832,687,449 cycles # 2.906 GHz + 24,282,426,073 instructions # 2.24 insn per cycle + 3.728894903 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2579) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 
tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515654 -Relative difference = 3.2588039900609506e-07 +Avg ME (F77/C++) = 2.0288063388515649 +Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.588361e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.923011e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.923011e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.388729e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.690145e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.690145e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.432860 sec -INFO: No Floating Point Exceptions have been reported - 6,763,126,248 cycles # 2.768 GHz - 12,520,790,366 instructions # 1.85 insn per cycle - 2.444836798 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3115) (512y: 0) (512z: 0) +TOTAL : 2.497295 sec + 6,743,813,449 cycles # 2.696 GHz + 12,543,269,382 instructions # 1.86 insn per cycle + 2.502704497 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3156) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 +Avg ME (F77/C++) = 2.0288063388516209 +Relative difference = 3.258803716446205e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.983949e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.371900e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.371900e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.651146e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.006867e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.006867e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.251146 sec -INFO: No Floating Point Exceptions have been reported - 6,291,656,449 cycles # 2.782 GHz - 11,662,894,163 instructions # 1.85 insn per cycle - 2.263135736 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2644) (512y: 239) (512z: 0) +TOTAL : 2.362181 sec + 6,370,126,838 cycles # 2.692 GHz + 11,708,850,355 instructions # 1.84 insn per cycle + 2.367368593 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2674) (512y: 239) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 +Avg ME (F77/C++) = 2.0288063388516209 +Relative difference = 3.258803716446205e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.728872e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.941749e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.941749e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.672883e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.874095e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.874095e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.960781 sec -INFO: No Floating Point Exceptions have been reported - 5,563,913,804 cycles # 1.872 GHz - 9,412,295,126 instructions # 1.69 insn per cycle - 2.972906161 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2099) (512y: 282) (512z: 1958) +TOTAL : 2.962382 sec + 5,387,973,040 cycles # 1.816 GHz + 9,344,687,874 instructions # 1.73 insn per cycle + 2.967757912 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2107) (512y: 282) (512z: 1954) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 +Avg ME (F77/C++) = 2.0288063388516209 +Relative difference = 3.258803716446205e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt index 3c1647789f..1f895c929f 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,236 +10,213 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:28:57 +DATE: 2025-10-11_16:18:48 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.391002e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.323919e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.976474e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.773620e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.074692e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.456461e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.534058 sec -INFO: No Floating Point Exceptions have been reported - 2,225,875,951 cycles # 2.883 GHz - 3,143,824,990 instructions # 1.41 insn per cycle - 0.828954123 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 212 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.534811 sec + 2,266,123,133 cycles # 2.828 GHz + 3,168,944,538 instructions # 1.40 insn per cycle + 0.857996121 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 168 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.586147e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.682611e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.682611e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.506524e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.597769e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.597769e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.200138 sec -INFO: No Floating Point Exceptions have been reported - 12,457,576,414 cycles # 2.958 GHz - 35,030,140,380 instructions # 2.81 insn per cycle - 4.211834896 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 430) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.291386 sec + 12,399,672,738 cycles # 2.887 GHz + 35,290,415,137 instructions # 2.85 insn per cycle + 4.296907910 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 447) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, 
FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515649 -Relative difference = 3.258803992249869e-07 +Avg ME (F77/C++) = 2.0288063388515654 +Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.003695e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.145378e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.145378e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.891328e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.022776e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.022776e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.637171 sec -INFO: No Floating Point Exceptions have been reported - 10,771,658,335 cycles # 2.953 GHz - 23,459,809,146 instructions # 2.18 insn per cycle - 3.648522280 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2378) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.735496 sec + 10,767,908,972 cycles # 2.879 GHz + 23,493,099,341 instructions # 2.18 insn per cycle + 3.741023923 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2365) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 
tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515654 -Relative difference = 3.2588039900609506e-07 +Avg ME (F77/C++) = 2.0288063388515649 +Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.029039e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.423785e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.423785e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.929407e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.312189e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.312189e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.231082 sec -INFO: No Floating Point Exceptions have been reported - 6,224,358,348 cycles # 2.777 GHz - 11,980,138,777 instructions # 1.92 insn per cycle - 2.242426635 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2468) (512y: 0) (512z: 0) +TOTAL : 2.235559 sec + 6,081,264,505 cycles # 2.715 GHz + 12,002,246,039 instructions # 1.97 insn per cycle + 2.240973571 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2491) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 +Avg ME (F77/C++) = 2.0288063388516209 +Relative difference = 3.258803716446205e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.044695e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.439218e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.439218e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.860705e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.225389e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.225389e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.224952 sec -INFO: No Floating Point Exceptions have been reported - 6,216,689,838 cycles # 2.781 GHz - 11,219,235,507 instructions # 1.80 insn per cycle - 2.236216110 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2098) (512y: 174) (512z: 0) +TOTAL : 2.264729 sec + 6,145,018,402 cycles # 2.708 GHz + 11,235,762,297 instructions # 1.83 insn per cycle + 2.270329967 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2110) (512y: 174) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 +Avg ME (F77/C++) = 2.0288063388516209 +Relative difference = 3.258803716446205e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.888626e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.118349e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.118349e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.696752e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.901055e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.901055e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.842958 sec -INFO: No Floating Point Exceptions have been reported - 5,376,391,405 cycles # 1.885 GHz - 9,136,626,879 instructions # 1.70 insn per cycle - 2.854254782 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1632) (512y: 208) (512z: 1567) +TOTAL : 2.944494 sec + 5,239,165,595 cycles # 1.777 GHz + 9,095,766,728 instructions # 1.74 insn per cycle + 2.949694561 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1638) (512y: 208) (512z: 1583) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 +Avg ME (F77/C++) = 2.0288063388516209 +Relative difference = 3.258803716446205e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.scaling new file mode 100644 index 0000000000..70eb313ac9 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_15:41:21 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +1.475062e+06 1 256 +3.218486e+06 2 256 +5.903821e+06 4 256 +1.165716e+07 8 256 +2.454885e+07 16 256 +4.527393e+07 32 256 +8.391766e+07 64 256 +1.334550e+08 128 256 +1.552485e+08 256 256 +1.694983e+08 512 256 +1.849571e+08 1024 256 +### GPU: scaling test 32 +1.882231e+05 1 32 +4.016921e+05 2 32 +8.022815e+05 4 32 +1.595811e+06 8 32 +3.056260e+06 16 32 +6.326142e+06 32 32 +1.208794e+07 64 32 +2.463478e+07 128 32 +4.741756e+07 256 32 +9.093281e+07 512 32 +1.150905e+08 1024 32 +1.344888e+08 2048 32 +1.543860e+08 4096 32 +1.683918e+08 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.843216e+05 1 256 +1.897524e+05 2 256 +1.896027e+05 4 256 +### CPU: scaling test 32 +1.666589e+05 1 32 +1.669510e+05 2 32 +1.791277e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.321762e+05 1 256 +4.399797e+05 2 256 +4.577304e+05 4 256 +### CPU: scaling test 32 +4.375351e+05 1 32 +3.779245e+05 2 32 +4.181545e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +9.280541e+05 1 256 +9.070263e+05 2 256 +9.020254e+05 4 256 +### CPU: scaling test 32 +8.873360e+05 1 32 +9.140769e+05 2 32 +9.224693e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +9.444090e+05 1 256 +9.480587e+05 2 256 +9.506189e+05 4 256 +### CPU: scaling test 32 +9.250159e+05 1 32 +9.436188e+05 2 32 +9.553023e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 
256 +6.540106e+05 1 256 +6.620410e+05 2 256 +6.781399e+05 4 256 +### CPU: scaling test 32 +5.655809e+05 1 32 +5.425522e+05 2 32 +6.546076e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index eed598e900..29a4ea8877 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_08:59:31 +DATE: 2025-10-11_15:19:12 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.348925e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.730429e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.847126e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.227728e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.785385e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.924249e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.489368 sec -INFO: No Floating Point Exceptions have been reported - 2,066,464,716 cycles # 2.888 GHz - 2,966,218,976 instructions # 1.44 insn per cycle - 0.775358949 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.492304 sec + 2,118,504,146 cycles # 2.819 GHz + 2,963,870,047 instructions # 1.40 insn per cycle + 0.808747497 seconds time elapsed +......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499356247485 -Relative difference = 1.9191351362116207e-05 +Avg ME (F77/GPU) = 2.0288499495945871 +Relative difference = 1.919823708908596e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.920704e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.976809e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.976809e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.880677e+05 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 1.933319e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.933319e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.564608 sec -INFO: No Floating Point Exceptions have been reported - 16,407,008,301 cycles # 2.946 GHz - 45,390,324,197 instructions # 2.77 insn per cycle - 5.572247633 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 591) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.662756 sec + 16,361,560,744 cycles # 2.887 GHz + 45,526,236,392 instructions # 2.78 insn per cycle + 5.668346367 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288198669441044 Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.527362e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.867119e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.867119e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.414646e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.739659e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.739659e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.418751 sec -INFO: No Floating Point Exceptions have been reported - 
7,148,582,676 cycles # 2.947 GHz - 17,841,430,692 instructions # 2.50 insn per cycle - 2.426747092 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3144) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.463879 sec + 7,092,934,877 cycles # 2.874 GHz + 17,852,493,922 instructions # 2.52 insn per cycle + 2.469325378 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193075684831 Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.351940e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.517580e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.517580e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.208525e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.313027e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.313027e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.358362 sec -INFO: No Floating Point Exceptions have been reported - 3,812,563,399 cycles # 2.792 GHz - 8,312,155,726 instructions # 2.18 insn per cycle - 1.366469053 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3367) (512y: 0) (512z: 0) +TOTAL : 1.365011 sec + 3,747,283,623 cycles # 2.735 GHz + 
8,291,354,119 instructions # 2.21 insn per cycle + 1.370608034 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.799220e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.010674e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.010674e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.454543e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.612605e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.612605e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.295301 sec -INFO: No Floating Point Exceptions have been reported - 3,622,174,398 cycles # 2.781 GHz - 7,961,498,247 instructions # 2.20 insn per cycle - 1.303182368 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3209) (512y: 20) (512z: 0) +TOTAL : 1.327433 sec + 3,648,803,599 cycles # 2.739 GHz + 8,020,246,707 instructions # 2.20 insn per cycle + 1.332943592 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.500324e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.161825e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.161825e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.298741e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.918817e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.918817e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.717843 sec -INFO: No Floating Point Exceptions have been reported - 3,332,199,340 cycles # 1.933 GHz - 6,146,454,565 instructions # 1.84 insn per cycle - 1.725889754 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2251) (512y: 22) (512z: 2155) +TOTAL : 1.753154 sec + 3,282,016,345 cycles # 1.867 GHz + 6,088,962,733 instructions # 1.86 insn per cycle + 1.758605907 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW 
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183148950338 Relative difference = 1.5521108056421764e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.scaling new file mode 100644 index 0000000000..d76cec9169 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_15:56:13 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +4.541979e+05 1 256 +9.203949e+05 2 256 +1.645855e+06 4 256 +3.099419e+06 8 256 +4.823113e+06 16 256 +7.898172e+06 32 256 +1.061455e+07 64 256 +1.233940e+07 128 256 +1.359197e+07 256 256 +1.426011e+07 512 256 +1.471228e+07 1024 256 +### GPU: scaling test 32 +5.695876e+04 1 32 +1.092163e+05 2 32 +2.189134e+05 4 32 +4.543656e+05 8 32 +8.666538e+05 16 32 +1.664792e+06 32 32 +3.023066e+06 64 32 +5.156183e+06 128 32 +7.621691e+06 256 32 +1.049897e+07 512 32 +1.232012e+07 1024 32 +1.355710e+07 2048 32 +1.432425e+07 4096 32 +1.475276e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.747944e+05 1 256 +1.817829e+05 2 256 +1.896771e+05 4 256 +### CPU: scaling test 32 +1.728805e+05 1 32 +1.767946e+05 2 32 +1.762418e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.997246e+05 1 256 +4.307310e+05 2 256 +4.464263e+05 4 256 +### CPU: scaling test 32 +3.999600e+05 1 32 +3.699679e+05 2 32 +4.315766e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +7.797794e+05 1 256 +8.305580e+05 2 256 +8.419045e+05 4 256 +### CPU: scaling test 32 +8.881488e+05 1 32 +9.130727e+05 2 32 +9.232345e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +9.581879e+05 1 256 +9.512415e+05 2 256 +9.501003e+05 4 256 +### CPU: scaling test 32 +9.220574e+05 1 32 +9.420354e+05 2 32 +8.881180e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 
256 +6.495302e+05 1 256 +6.782481e+05 2 256 +6.868630e+05 4 256 +### CPU: scaling test 32 +5.595188e+05 1 32 +6.234779e+05 2 32 +6.548319e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.txt new file mode 100644 index 0000000000..e92eb3813b --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.txt @@ -0,0 +1,223 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_15:51:48 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.351930e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.489593e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.498993e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 1.246737 sec + 4,579,068,239 cycles # 2.831 GHz + 6,336,239,576 instructions # 1.38 insn per cycle + 1.674994938 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028811e+00 +Avg ME (F77/GPU) = 2.0288499532034621 +Relative difference = 1.920001590188648e-05 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.876691e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.929278e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.929278e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 5.673971 sec + 16,357,814,340 cycles # 2.881 GHz + 45,526,139,472 instructions # 2.78 insn per cycle + 5.679332523 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198669441044 +Relative difference = 6.558289825352968e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 4.428670e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.753669e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.753669e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 2.455440 sec + 7,090,910,684 cycles # 2.883 GHz + 17,852,546,600 instructions # 2.52 insn per cycle + 2.460806632 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288193075684831 +Relative difference = 1.515997647531052e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.063338e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.125894e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.125894e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.386534 sec + 3,756,179,949 cycles # 2.700 GHz + 8,291,185,200 instructions # 2.21 insn per cycle + 1.391900760 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.396585e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.545366e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.545366e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.336868 sec + 3,642,317,678 cycles # 2.716 GHz + 8,019,205,916 instructions # 2.20 insn per cycle + 1.344058514 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.310834e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.934764e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.934764e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.748608 sec + 3,284,552,833 cycles # 1.874 GHz + 6,088,622,803 instructions # 1.85 insn per cycle + 1.753990283 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183148950338 +Relative difference = 1.5521108056421764e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt index ba391daf9b..3e1eb5adfb 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,252 +10,216 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:38:02 +DATE: 2025-10-11_16:29:11 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.962971e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.366502e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.366502e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.961069e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.550509e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.550509e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.683449 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,639,955,466 cycles # 2.881 GHz - 4,089,465,491 instructions # 1.55 insn per cycle - 0.973820402 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge +TOTAL : 0.685895 sec + 2,724,461,027 cycles # 2.849 GHz + 4,115,491,673 instructions # 1.51 insn per cycle + 1.013379386 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499356247485 -Relative difference = 1.9191351362116207e-05 +Avg ME (F77/GPU) = 2.0288499495945871 +Relative difference = 1.919823708908596e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.921187e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.975107e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.975107e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.879765e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.932625e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.932625e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.590872 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 16,505,827,538 cycles # 2.949 GHz - 45,383,324,587 instructions # 2.75 insn per cycle - 5.597525299 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 591) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.709270 sec + 16,545,315,698 cycles # 2.895 GHz + 45,565,469,143 instructions # 2.75 insn per cycle + 5.715931822 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288198669441044 Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.503675e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.835801e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.835801e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.377287e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.696132e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.696132e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.463825 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 7,301,370,898 cycles # 2.956 GHz - 18,072,803,019 instructions # 2.48 insn per cycle - 2.471007950 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3144) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.532029 sec + 7,290,698,661 cycles # 2.873 GHz + 18,128,482,182 instructions # 2.49 insn per cycle + 2.538964767 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193075684831 Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.228346e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.356902e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.356902e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.010327e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.072284e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.072284e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.409585 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 3,950,274,134 cycles # 2.790 GHz - 8,500,615,795 instructions # 2.15 insn per cycle - 1.416669722 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3367) (512y: 0) (512z: 0) +TOTAL : 1.445098 sec + 3,968,422,684 cycles # 2.734 GHz + 8,524,408,845 instructions # 2.15 insn per cycle + 1.452187655 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.630316e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.908478e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.908478e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.285117e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.425187e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.425187e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.350838 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 3,776,468,219 cycles # 2.783 GHz - 8,150,432,975 instructions # 2.16 insn per cycle - 1.357973048 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3209) (512y: 20) (512z: 0) +TOTAL : 1.403001 sec + 3,860,651,396 cycles # 2.740 GHz + 8,252,993,133 instructions # 2.14 insn per cycle + 1.409829697 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.446924e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.088794e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.088794e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.256834e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.869079e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.869079e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.766906 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 3,483,580,907 cycles # 1.964 GHz - 6,352,443,418 instructions # 1.82 insn per cycle - 1.774118995 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2251) (512y: 22) (512z: 2155) +TOTAL : 1.813530 sec + 3,488,089,376 cycles # 1.917 GHz + 6,339,016,347 instructions # 1.82 insn per cycle + 1.820470769 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183148950338 Relative difference = 1.5521108056421764e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt index eaf1557b5a..001fd1b5e8 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:50:12 +DATE: 2025-10-11_16:44:30 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.125576e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.707303e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.828418e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.384623e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.781787e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.923075e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079446e+00 +- 3.403306e-03 ) GeV^0 -TOTAL : 0.579716 sec -INFO: No Floating Point Exceptions have been reported - 2,336,853,883 cycles # 2.860 GHz - 3,355,823,518 instructions # 1.44 insn per cycle - 0.873538557 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common -==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.586690 sec + 2,388,718,169 cycles # 2.838 GHz + 3,423,003,931 instructions # 1.43 insn per cycle + 0.899326702 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499356247485 -Relative difference = 1.9191351362116207e-05 +Avg ME (F77/GPU) = 2.0288499495945871 +Relative difference = 1.919823708908596e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.929027e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.983438e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.983438e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.880714e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.934194e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.934194e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 5.578220 sec -INFO: No Floating Point Exceptions have been reported - 16,412,792,219 cycles # 2.940 GHz - 45,364,108,775 instructions # 2.76 insn per cycle - 5.583854256 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 591) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.720004 sec + 16,536,660,388 cycles # 2.889 GHz + 45,556,960,525 instructions # 2.75 insn per cycle + 5.725324950 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288198669441044 Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = 
CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.528116e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.863028e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.863028e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.433465e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.759989e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.759989e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 2.458830 sec -INFO: No Floating Point Exceptions have been reported - 7,256,357,914 cycles # 2.945 GHz - 17,803,442,746 instructions # 2.45 insn per cycle - 2.464565338 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3144) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.509292 sec + 7,256,957,374 cycles # 2.887 GHz + 17,864,987,256 instructions # 2.46 insn per cycle + 2.514536012 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193075684831 Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.321630e+05 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 9.466483e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.466483e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.020309e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.092138e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.092138e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.402951 sec -INFO: No Floating Point Exceptions have been reported - 3,915,341,003 cycles # 2.781 GHz - 8,245,891,296 instructions # 2.11 insn per cycle - 1.408611815 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3367) (512y: 0) (512z: 0) +TOTAL : 1.453461 sec + 3,918,315,703 cycles # 2.689 GHz + 8,275,994,533 instructions # 2.11 insn per cycle + 1.458689528 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.769699e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.005525e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.005525e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.428992e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.604343e+05 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 9.604343e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.339268 sec -INFO: No Floating Point Exceptions have been reported - 3,730,447,512 cycles # 2.775 GHz - 7,861,984,465 instructions # 2.11 insn per cycle - 1.344998375 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3209) (512y: 20) (512z: 0) +TOTAL : 1.389726 sec + 3,813,398,977 cycles # 2.735 GHz + 7,970,393,641 instructions # 2.09 insn per cycle + 1.395086187 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.517692e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.188107e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.188107e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.306240e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.928204e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.928204e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.753383 sec -INFO: No Floating Point Exceptions have been reported - 3,445,483,739 cycles # 1.959 GHz - 
6,046,658,237 instructions # 1.75 insn per cycle - 1.759146158 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2251) (512y: 22) (512z: 2155) +TOTAL : 1.809723 sec + 3,457,472,821 cycles # 1.906 GHz + 6,039,803,289 instructions # 1.75 insn per cycle + 1.815214301 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183148950338 Relative difference = 1.5521108056421764e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt index 0132142a7f..d6dd5599d5 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:47:21 +DATE: 2025-10-11_16:40:59 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.231900e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.718618e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.843172e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.173088e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.784679e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.922376e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.520956 sec -INFO: No Floating Point Exceptions have been reported - 2,145,908,279 cycles # 2.880 GHz - 3,342,720,192 instructions # 1.56 insn per cycle - 0.802555619 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst -==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.528664 sec + 2,228,192,580 cycles # 2.835 GHz + 3,376,529,061 instructions # 1.52 insn per cycle + 0.842332325 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499356247485 -Relative difference = 1.9191351362116207e-05 +Avg ME (F77/GPU) = 2.0288499495945871 +Relative difference = 1.919823708908596e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.929666e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.983661e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.983661e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.871432e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.923569e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.923569e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.518695 sec -INFO: No Floating Point Exceptions have been reported - 16,237,309,072 cycles # 2.940 GHz - 45,332,194,999 instructions # 2.79 insn per cycle - 5.524338903 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 591) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.691172 sec + 16,369,213,744 cycles # 2.874 GHz + 45,526,750,504 instructions # 2.78 insn per cycle + 5.696402221 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: 
FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288198669441044 Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.531812e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.871745e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.871745e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.441693e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.769480e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.769480e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.400101 sec -INFO: No Floating Point Exceptions have been reported - 7,092,917,063 cycles # 2.949 GHz - 17,790,950,300 instructions # 2.51 insn per cycle - 2.405895056 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3144) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.448890 sec + 7,093,051,214 cycles # 2.891 GHz + 17,852,960,067 instructions # 2.52 insn per cycle + 2.454461827 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193075684831 Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.364764e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.520513e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.520513e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.163467e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.249025e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.249025e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.339869 sec -INFO: No Floating Point Exceptions have been reported - 3,746,789,760 cycles # 2.786 GHz - 8,261,610,745 instructions # 2.20 insn per cycle - 1.345882215 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3367) (512y: 0) (512z: 0) +TOTAL : 1.371747 sec + 3,753,987,891 cycles # 2.728 GHz + 8,291,362,993 instructions # 2.21 insn per cycle + 1.377043835 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.818621e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.013746e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.013746e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.404785e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.570601e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.570601e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.275913 sec -INFO: No Floating Point Exceptions have been reported - 3,561,649,230 cycles # 2.781 GHz - 7,911,264,889 instructions # 2.22 insn per cycle - 1.281614236 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3209) (512y: 20) (512z: 0) +TOTAL : 1.335938 sec + 3,649,997,495 cycles # 2.722 GHz + 8,019,382,433 instructions # 2.20 insn per cycle + 1.341456805 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.490214e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.139560e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.139560e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.228574e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.840288e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.840288e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.701983 sec -INFO: No Floating Point Exceptions have been reported - 3,270,370,699 cycles # 1.916 GHz - 6,096,029,839 instructions # 1.86 insn per cycle - 1.707817189 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2251) (512y: 22) (512z: 2155) +TOTAL : 1.772330 sec + 3,277,054,131 cycles # 1.844 GHz + 6,089,082,639 instructions # 1.86 insn per cycle + 1.777760056 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183148950338 Relative difference = 1.5521108056421764e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_noBlas.txt new file mode 100644 index 0000000000..0ad3efbc84 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_noBlas.txt @@ -0,0 +1,223 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasNoBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasNoBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_16:50:09 + +HASBLAS=hasNoBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 8.507701e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.798145e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.925897e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.495248 sec + 2,073,360,534 cycles # 2.817 GHz + 2,919,069,837 instructions # 1.41 insn per cycle + 0.794188547 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028811e+00 +Avg ME (F77/GPU) = 2.0288499495945871 +Relative difference = 1.919823708908596e-05 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.871656e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.924156e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.924156e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 5.690466 sec + 16,392,687,892 cycles # 2.879 GHz + 45,529,529,055 instructions # 2.78 insn per cycle + 5.695668537 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198669441044 +Relative difference = 6.558289825352968e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 4.439601e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.767131e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.767131e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 2.449797 sec + 7,091,941,326 cycles # 2.890 GHz + 17,852,858,856 instructions # 2.52 insn per cycle + 2.455296966 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288193075684831 +Relative difference = 1.515997647531052e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.145431e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.245108e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.245108e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.374709 sec + 3,766,055,040 cycles # 2.731 GHz + 8,291,749,848 instructions # 2.20 insn per cycle + 1.380351643 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.422664e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.588896e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.588896e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.332190 sec + 3,646,916,248 cycles # 2.728 GHz + 8,019,155,847 instructions # 2.20 insn per cycle + 1.337783089 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.310342e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.933915e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.933915e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.749833 sec + 3,289,282,662 cycles # 1.875 GHz + 6,089,226,401 instructions # 1.85 insn per cycle + 1.755424623 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183148950338 +Relative difference = 1.5521108056421764e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt index 55c92f68ec..0d4e6e9f4e 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,235 +10,213 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:44:35 +DATE: 2025-10-11_16:37:35 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.418560e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.722658e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.839243e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.371325e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.785294e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.923320e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.630221 sec -INFO: No Floating Point Exceptions have been reported - 2,475,236,721 cycles # 2.897 GHz - 3,823,734,565 instructions # 1.54 insn per cycle - 0.911361538 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst +TOTAL : 0.635131 sec + 2,535,737,467 cycles # 2.824 GHz + 3,842,575,439 instructions # 1.52 insn per cycle + 0.954476643 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499356247485 -Relative difference = 1.9191351362116207e-05 +Avg ME (F77/GPU) = 2.0288499495945871 +Relative difference = 1.919823708908596e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.933112e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.987540e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.987540e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.876671e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.930263e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.930263e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.510619 sec -INFO: No Floating Point Exceptions have been reported - 16,239,692,933 cycles # 2.945 GHz - 45,332,021,728 instructions # 2.79 insn per cycle - 5.516250908 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 591) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.674874 sec + 16,371,341,972 cycles # 2.883 GHz + 45,526,097,275 instructions # 2.78 insn per cycle + 5.680145436 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: 
FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288198669441044 Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.528380e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.868469e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.868469e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.409852e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.733764e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.733764e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.402631 sec -INFO: No Floating Point Exceptions have been reported - 7,087,618,340 cycles # 2.944 GHz - 17,790,727,043 instructions # 2.51 insn per cycle - 2.408346877 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3144) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.465466 sec + 7,089,429,077 cycles # 2.870 GHz + 17,852,779,482 instructions # 2.52 insn per cycle + 2.470998970 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193075684831 Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.367783e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.536121e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.536121e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.159709e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.263116e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.263116e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.339197 sec -INFO: No Floating Point Exceptions have been reported - 3,748,433,186 cycles # 2.789 GHz - 8,262,218,774 instructions # 2.20 insn per cycle - 1.344812605 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3367) (512y: 0) (512z: 0) +TOTAL : 1.372303 sec + 3,755,689,027 cycles # 2.728 GHz + 8,291,380,091 instructions # 2.21 insn per cycle + 1.377787541 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.816225e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.011910e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.011910e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.407094e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.566877e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.566877e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.274973 sec -INFO: No Floating Point Exceptions have been reported - 3,561,414,995 cycles # 2.782 GHz - 7,912,015,045 instructions # 2.22 insn per cycle - 1.280637958 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3209) (512y: 20) (512z: 0) +TOTAL : 1.334826 sec + 3,652,466,006 cycles # 2.727 GHz + 8,020,599,017 instructions # 2.20 insn per cycle + 1.340268045 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.504790e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.157762e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.157762e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.261859e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.880005e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.880005e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.699252 sec -INFO: No Floating Point Exceptions have been reported - 3,270,672,138 cycles # 1.919 GHz - 6,095,863,693 instructions # 1.86 insn per cycle - 1.704973507 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2251) (512y: 22) (512z: 2155) +TOTAL : 1.763075 sec + 3,282,506,046 cycles # 1.857 GHz + 6,088,973,421 instructions # 1.85 insn per cycle + 1.768455658 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183148950338 Relative difference = 1.5521108056421764e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt index 5e80ecf473..e0e7f701d0 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_08:59:52 +DATE: 2025-10-11_15:19:36 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.326131e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.746336e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.856838e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.162146e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.783523e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.914919e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.493394 sec -INFO: No Floating Point Exceptions have been reported - 2,062,281,894 cycles # 2.861 GHz - 2,938,913,241 instructions # 1.43 insn per cycle - 0.784913836 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 126 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.491426 sec + 2,125,746,364 cycles # 2.830 GHz + 2,979,109,571 instructions # 1.40 insn per cycle + 0.808584273 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 96 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499356247485 -Relative difference = 1.9191351362116207e-05 +Avg ME (F77/GPU) = 2.0288499495945871 +Relative difference = 1.919823708908596e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.953822e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.011638e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.011638e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.921360e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.976251e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.976251e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.471139 sec -INFO: No Floating Point Exceptions have been reported - 16,020,529,034 cycles # 2.925 GHz - 44,492,038,074 instructions # 2.78 insn per cycle - 5.480388445 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 536) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.544826 sec + 16,047,528,517 cycles # 2.892 GHz + 44,602,173,132 instructions # 2.78 insn per cycle + 5.550245916 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 537) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288198669441044 Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.317220e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.788673e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.788673e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.214945e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.668104e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.668104e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.075008 sec -INFO: No Floating Point Exceptions have been reported - 6,135,177,420 cycles # 2.947 GHz - 17,131,917,948 instructions # 2.79 insn per cycle - 2.082995277 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2863) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.098377 sec + 6,110,919,161 cycles # 2.906 GHz + 17,150,206,958 instructions # 2.81 insn per cycle + 2.103751937 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2861) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193075684831 Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.077036e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
6.672972e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.672972e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.851382e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.388872e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.388872e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.827961 sec -INFO: No Floating Point Exceptions have been reported - 5,098,745,585 cycles # 2.778 GHz - 10,277,927,063 instructions # 2.02 insn per cycle - 1.836088116 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3907) (512y: 0) (512z: 0) +TOTAL : 1.879565 sec + 5,032,467,533 cycles # 2.672 GHz + 10,256,120,490 instructions # 2.04 insn per cycle + 1.885016732 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3911) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.138089e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.753320e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.753320e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.035975e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.607599e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.607599e+05 ) sec^-1 
MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.811229 sec -INFO: No Floating Point Exceptions have been reported - 5,047,478,028 cycles # 2.778 GHz - 10,048,355,032 instructions # 1.99 insn per cycle - 1.819572790 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3806) (512y: 2) (512z: 0) +TOTAL : 1.824491 sec + 4,977,961,454 cycles # 2.721 GHz + 10,027,255,295 instructions # 2.01 insn per cycle + 1.830117525 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3808) (512y: 2) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.690006e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.022722e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.022722e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.496582e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.807885e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.807885e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 2.339710 sec -INFO: No Floating Point Exceptions have been reported - 4,430,484,038 cycles # 1.888 GHz - 8,494,687,635 instructions # 1.92 insn per cycle - 2.347901015 seconds time 
elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2746) (512y: 4) (512z: 2754) +TOTAL : 2.420813 sec + 4,388,139,749 cycles # 1.809 GHz + 8,457,918,888 instructions # 1.93 insn per cycle + 2.426523884 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2749) (512y: 4) (512z: 2749) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183148950338 Relative difference = 1.5521108056421764e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt index 8666f655aa..f0b80e260e 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,236 +10,213 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:29:20 +DATE: 2025-10-11_16:19:19 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.502979e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.757241e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.878370e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.131628e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.790004e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.927316e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.488684 sec -INFO: No Floating Point Exceptions have been reported - 2,072,092,086 cycles # 2.888 GHz - 2,980,809,123 instructions # 1.44 insn per cycle - 0.774128701 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.492105 sec + 2,126,004,887 cycles # 2.830 GHz + 2,972,871,951 instructions # 1.40 insn per cycle + 0.808125336 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499356247485 -Relative difference = 1.9191351362116207e-05 +Avg ME (F77/GPU) = 2.0288499495945871 +Relative difference = 1.919823708908596e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.497944e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.591831e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.591831e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.361435e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.444812e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.444812e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.301034 sec -INFO: No Floating Point Exceptions have been reported - 12,652,758,977 cycles # 2.937 GHz - 34,660,886,060 instructions # 2.74 insn per cycle - 4.309086604 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 683) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.526570 sec + 12,786,889,749 cycles # 2.822 GHz + 34,767,168,341 instructions # 2.72 insn per cycle + 4.531843724 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 649) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, 
FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199094356969 -Relative difference = 4.463890496342449e-08 +Avg ME (F77/C++) = 2.0288198597263545 +Relative difference = 6.914050807267083e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.170038e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.622090e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.622090e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.142214e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.587894e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.587894e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.133248 sec -INFO: No Floating Point Exceptions have been reported - 6,307,478,134 cycles # 2.947 GHz - 14,873,781,997 instructions # 2.36 insn per cycle - 2.140857047 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2975) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.126971 sec + 6,176,687,935 cycles # 2.898 GHz + 14,909,588,070 instructions # 2.41 insn per cycle + 2.132251600 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2978) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028819e+00
-Avg ME (F77/C++) = 2.0288193755550310
-Relative difference = 1.8511017053446366e-07
+Avg ME (F77/C++) = 2.0288193110609427
+Relative difference = 1.5332118970762702e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 7.248492e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.104502e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.104502e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.053580e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.852260e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.852260e+05 ) sec^-1
MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.548692 sec
-INFO: No Floating Point Exceptions have been reported
- 4,331,332,767 cycles # 2.784 GHz
- 9,119,017,787 instructions # 2.11 insn per cycle
- 1.556682967 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4456) (512y: 0) (512z: 0)
+TOTAL : 1.573119 sec
+ 4,286,494,919 cycles # 2.717 GHz
+ 9,134,727,561 instructions # 2.13 insn per cycle
+ 1.578532938 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4466) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288182069780305 -Relative difference = 1.0201902325125583e-07 +Avg ME (F77/C++) = 2.0288181575015187 +Relative difference = 7.763215770863579e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.353371e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.251881e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.251881e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.155196e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.974374e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.974374e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.529166 sec -INFO: No Floating Point Exceptions have been reported - 4,288,032,705 cycles # 2.791 GHz - 8,709,611,506 instructions # 2.03 insn per cycle - 1.537124060 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4233) (512y: 0) (512z: 0) +TOTAL : 1.552673 sec + 4,257,884,690 cycles # 2.734 GHz + 8,700,271,049 instructions # 2.04 insn per cycle + 1.558196136 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4224) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288182069780305 -Relative difference = 1.0201902325125583e-07 +Avg ME (F77/C++) = 2.0288181575015187 +Relative difference = 7.763215770863579e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.411255e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.862053e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.862053e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.246960e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.671205e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.671205e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 2.041395 sec -INFO: No Floating Point Exceptions have been reported - 3,904,121,018 cycles # 1.906 GHz - 7,856,412,999 instructions # 2.01 insn per cycle - 2.049301951 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4273) (512y: 0) (512z: 2558) +TOTAL : 2.085797 sec + 3,847,204,769 cycles # 1.841 GHz + 7,838,410,301 instructions # 2.04 insn per cycle + 2.091150296 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4276) (512y: 0) (512z: 2561) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183246739209 -Relative difference = 1.6003107281264138e-07 +Avg ME (F77/C++) = 2.0288182856747881 +Relative difference = 1.4080848467904676e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt index 74b1cf75ec..26b7d791d0 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,236 +10,213 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:29:40 +DATE: 2025-10-11_16:19:42 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.573239e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.755917e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.881516e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.156027e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.795194e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.935274e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.487451 sec -INFO: No Floating Point Exceptions have been reported - 2,067,657,057 cycles # 2.894 GHz - 2,969,147,079 instructions # 1.44 insn per cycle - 0.771604792 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 126 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.491299 sec + 2,134,224,720 cycles # 2.818 GHz + 2,993,931,932 instructions # 1.40 insn per cycle + 0.814346515 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 96 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499356247485 -Relative difference = 1.9191351362116207e-05 +Avg ME (F77/GPU) = 2.0288499495945871 +Relative difference = 1.919823708908596e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.674902e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.781976e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.781976e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.565640e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.664688e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.664688e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.022349 sec -INFO: No Floating Point Exceptions have been reported - 11,884,847,246 cycles # 2.950 GHz - 35,128,022,846 instructions # 2.96 insn per cycle - 4.030241157 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 453) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.173683 sec + 11,879,331,181 cycles # 2.844 GHz + 35,236,712,439 instructions # 2.97 insn per cycle + 4.178908664 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 466) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, 
FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199094356969 -Relative difference = 4.463890496342449e-08 +Avg ME (F77/C++) = 2.0288198597263545 +Relative difference = 6.914050807267083e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.473588e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.982990e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.982990e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.266171e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.744141e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.744141e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.018275 sec -INFO: No Floating Point Exceptions have been reported - 5,977,087,994 cycles # 2.951 GHz - 14,582,659,278 instructions # 2.44 insn per cycle - 2.026172081 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2569) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.079083 sec + 5,991,903,430 cycles # 2.877 GHz + 14,602,254,330 instructions # 2.44 insn per cycle + 2.084327795 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2563) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193583255634 -Relative difference = 1.7661780742548925e-07 +Avg ME (F77/C++) = 2.0288193158339709 +Relative difference = 1.5567380381214021e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.377553e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.279187e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.279187e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.207154e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.042682e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.042682e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.524553 sec -INFO: No Floating Point Exceptions have been reported - 4,234,763,555 cycles # 2.764 GHz - 8,897,798,804 instructions # 2.10 insn per cycle - 1.532761317 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3552) (512y: 0) (512z: 0) +TOTAL : 1.541810 sec + 4,186,740,965 cycles # 2.708 GHz + 8,926,188,902 instructions # 2.13 insn per cycle + 1.547085242 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3572) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288182107033208 -Relative difference = 1.0385521077446488e-07 +Avg ME (F77/C++) = 2.0288181557552889 +Relative difference = 7.677144480713156e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.495273e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.420338e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.420338e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.102028e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.913223e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.913223e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.502506 sec -INFO: No Floating Point Exceptions have been reported - 4,214,392,060 cycles # 2.792 GHz - 8,461,762,117 instructions # 2.01 insn per cycle - 1.510417354 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3296) (512y: 0) (512z: 0) +TOTAL : 1.563681 sec + 4,235,267,452 cycles # 2.701 GHz + 8,456,560,522 instructions # 2.00 insn per cycle + 1.569074089 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3298) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288182107033208 -Relative difference = 1.0385521077446488e-07 +Avg ME (F77/C++) = 2.0288181557552889 +Relative difference = 7.677144480713156e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.487070e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.949626e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.949626e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.304407e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.741587e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.741587e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 2.014420 sec -INFO: No Floating Point Exceptions have been reported - 3,856,759,695 cycles # 1.908 GHz - 7,749,847,516 instructions # 2.01 insn per cycle - 2.022398856 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3289) (512y: 0) (512z: 2110) +TOTAL : 2.064360 sec + 3,788,747,014 cycles # 1.832 GHz + 7,722,840,376 instructions # 2.04 insn per cycle + 2.069669389 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3288) (512y: 0) (512z: 2115) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028818e+00
-Avg ME (F77/C++) = 2.0288183204829693
-Relative difference = 1.5796536184903122e-07
+Avg ME (F77/C++) = 2.0288182756630704
+Relative difference = 1.3587373071042248e-07
OK (relative difference <= 5E-3)
=========================================================================
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.scaling
new file mode 100644
index 0000000000..54ccd09765
--- /dev/null
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.scaling
@@ -0,0 +1,137 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
+
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+BACKEND=cpp512y (was cppauto)
+OMPFLAGS=
+FPTYPE='m'
+HELINL='0'
+HRDCOD='0'
+HASCURAND=hasCurand
+HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
+Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
+make: Nothing to be done for 'gtestlibs'.
+
+make USEBUILDDIR=1 BACKEND=cuda
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+
+make USEBUILDDIR=1 BACKEND=cppnone
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+
+make USEBUILDDIR=1 BACKEND=cppsse4
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+
+make USEBUILDDIR=1 BACKEND=cppavx2
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+
+make USEBUILDDIR=1 BACKEND=cpp512y
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_15:41:00 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +1.555626e+06 1 256 +2.986119e+06 2 256 +6.036846e+06 4 256 +1.188714e+07 8 256 +2.177797e+07 16 256 +4.206332e+07 32 256 +5.661642e+07 64 256 +6.199098e+07 128 256 +6.763415e+07 256 256 +7.331358e+07 512 256 +7.450922e+07 1024 256 +### GPU: scaling test 32 +1.688262e+05 1 32 +3.674276e+05 2 32 +6.877986e+05 4 32 +1.577034e+06 8 32 +2.900718e+06 16 32 +6.084626e+06 32 32 +1.103805e+07 64 32 +2.304347e+07 128 32 +4.366714e+07 256 32 +5.801104e+07 512 32 +6.280270e+07 1024 32 +6.781899e+07 2048 32 +7.247457e+07 4096 32 +7.443838e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.683557e+05 1 256 +1.766666e+05 2 256 +1.772916e+05 4 256 +### CPU: scaling test 32 +1.624761e+05 1 32 +1.667961e+05 2 32 +1.691810e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.045208e+05 1 256 +3.168070e+05 2 256 +3.217376e+05 4 256 +### CPU: scaling test 32 +2.400438e+05 1 32 +2.988113e+05 2 32 +3.019623e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.679979e+05 1 256 +5.383388e+05 2 256 +5.290511e+05 4 256 +### CPU: scaling test 32 +4.501210e+05 1 32 +5.408786e+05 2 32 +5.212787e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.337937e+05 1 256 +5.659660e+05 2 256 +5.616905e+05 4 256 +### CPU: scaling test 32 +5.554591e+05 1 32 +5.687726e+05 2 32 +5.722998e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 
256 +3.669688e+05 1 256 +3.628236e+05 2 256 +3.574239e+05 4 256 +### CPU: scaling test 32 +3.591712e+05 1 32 +3.436223e+05 2 32 +3.302689e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 46bc87b45e..544d45db6c 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_08:58:41 +DATE: 2025-10-11_15:18:10 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.456560e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.379988e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.000705e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.769964e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.181272e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.572183e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.534501 sec -INFO: No Floating Point Exceptions have been reported - 2,219,584,721 cycles # 2.878 GHz - 3,138,987,562 instructions # 1.41 insn per cycle - 0.829330920 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.539441 sec + 2,308,666,493 cycles # 2.818 GHz + 3,226,425,933 instructions # 1.40 insn per cycle + 0.876647709 seconds time elapsed +......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063423243874 -Relative difference = 3.241686432649386e-07 +Avg ME (F77/GPU) = 2.0288063984103686 +Relative difference = 2.9652383466921405e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.813220e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.859845e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.859845e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.759806e+05 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 1.804204e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.804204e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.928691 sec -INFO: No Floating Point Exceptions have been reported - 17,514,594,449 cycles # 2.949 GHz - 46,201,641,620 instructions # 2.64 insn per cycle - 5.940965337 seconds time elapsed +TOTAL : 6.067261 sec + 17,454,635,732 cycles # 2.875 GHz + 46,423,626,762 instructions # 2.66 insn per cycle + 6.073054725 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.229159e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.395479e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.395479e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.147663e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.305031e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.305031e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.394540 sec -INFO: No Floating Point Exceptions have been reported - 10,052,901,757 cycles # 2.953 GHz - 27,702,324,481 instructions # 2.76 insn 
per cycle - 3.406321535 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2581) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.441893 sec + 9,972,963,833 cycles # 2.894 GHz + 27,538,315,448 instructions # 2.76 insn per cycle + 3.447650533 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2543) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.062332e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.465524e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.465524e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.024399e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.421447e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.421447e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.217135 sec -INFO: No Floating Point Exceptions have been reported - 6,171,509,914 cycles # 2.770 GHz - 12,603,170,569 instructions # 2.04 insn per cycle - 2.229995554 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2773) (512y: 0) (512z: 0) +TOTAL : 2.195598 sec + 6,002,435,023 cycles # 2.728 GHz + 12,431,827,184 instructions # 2.07 insn per cycle + 2.201348309 seconds time 
elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2753) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.580384e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.068896e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.068896e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.239682e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.660399e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.660399e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.025421 sec -INFO: No Floating Point Exceptions have been reported - 5,651,741,681 cycles # 2.776 GHz - 12,038,443,177 instructions # 2.13 insn per cycle - 2.038138408 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2518) (512y: 146) (512z: 0) +TOTAL : 2.110434 sec + 5,712,484,983 cycles # 2.700 GHz + 11,998,977,462 instructions # 2.10 insn per cycle + 2.116158863 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2553) (512y: 126) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.630973e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.831034e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.831034e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.500878e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.684605e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.684605e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.034632 sec -INFO: No Floating Point Exceptions have been reported - 5,740,712,408 cycles # 1.885 GHz - 8,225,599,297 instructions # 1.43 insn per cycle - 3.047056631 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1671) (512y: 126) (512z: 1862) +TOTAL : 3.104242 sec + 5,600,150,554 cycles # 1.801 GHz + 7,978,262,251 instructions # 1.42 insn per cycle + 3.109987032 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1645) (512y: 104) (512z: 1823) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW 
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.scaling new file mode 100644 index 0000000000..108784d281 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_15:55:32 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +3.842927e+05 1 256 +7.220512e+05 2 256 +1.491222e+06 4 256 +2.667848e+06 8 256 +4.492588e+06 16 256 +7.139826e+06 32 256 +9.157999e+06 64 256 +1.073484e+07 128 256 +1.179428e+07 256 256 +1.249669e+07 512 256 +1.288538e+07 1024 256 +### GPU: scaling test 32 +4.771078e+04 1 32 +9.904224e+04 2 32 +1.834573e+05 4 32 +3.665684e+05 8 32 +7.223823e+05 16 32 +1.469468e+06 32 32 +2.777699e+06 64 32 +4.610551e+06 128 32 +7.035262e+06 256 32 +9.216118e+06 512 32 +1.072571e+07 1024 32 +1.171381e+07 2048 32 +1.244431e+07 4096 32 +1.273882e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.731213e+05 1 256 +1.728516e+05 2 256 +1.721045e+05 4 256 +### CPU: scaling test 32 +1.615729e+05 1 32 +1.697199e+05 2 32 +1.614079e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.020824e+05 1 256 +3.069129e+05 2 256 +3.229135e+05 4 256 +### CPU: scaling test 32 +3.068132e+05 1 32 +3.048781e+05 2 32 +3.056454e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.343999e+05 1 256 +5.367208e+05 2 256 +5.297172e+05 4 256 +### CPU: scaling test 32 +5.308120e+05 1 32 +5.388158e+05 2 32 +5.419802e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.825073e+05 1 256 +5.664394e+05 2 256 +5.715909e+05 4 256 +### CPU: scaling test 32 +5.596656e+05 1 32 +5.686160e+05 2 32 +5.559851e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 
256 +3.589260e+05 1 256 +3.525435e+05 2 256 +3.573650e+05 4 256 +### CPU: scaling test 32 +3.610027e+05 1 32 +3.443008e+05 2 32 +3.569646e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.txt new file mode 100644 index 0000000000..7312e696ce --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.txt @@ -0,0 +1,223 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_15:51:10 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.104417e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.285432e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.297689e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.279377 sec + 4,758,540,406 cycles # 2.854 GHz + 6,643,646,071 instructions # 1.40 insn per cycle + 1.727175074 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028807e+00 +Avg ME (F77/GPU) = 2.0288064033535846 +Relative difference = 2.940873209649997e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.760176e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.804148e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.804148e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 6.064955 sec + 17,456,010,031 cycles # 2.876 GHz + 46,423,917,890 instructions # 2.66 insn per cycle + 6.070556221 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063903750300 +Relative difference = 3.0048445715164216e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.112364e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.267713e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.267713e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.477891 sec + 9,968,942,008 cycles # 2.863 GHz + 27,538,128,939 instructions # 2.76 insn per cycle + 3.483544020 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2543) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063903750300 +Relative difference = 3.0048445715164216e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 5.028981e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.424760e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.424760e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.192400 sec + 5,973,164,521 cycles # 2.719 GHz + 12,431,134,039 instructions # 2.08 insn per cycle + 2.197968192 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2753) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 5.257840e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.686842e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.686842e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.101990 sec + 5,696,565,349 cycles # 2.704 GHz + 11,998,610,945 instructions # 2.11 insn per cycle + 2.107441314 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2553) (512y: 126) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.469903e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.652910e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.652910e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.130516 sec + 5,582,204,405 cycles # 1.781 GHz + 7,977,597,583 instructions # 1.43 insn per cycle + 3.135909354 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1645) (512y: 104) (512z: 1823) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_noBlas.txt new file mode 100644 index 0000000000..a27304f7a2 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_noBlas.txt @@ -0,0 +1,223 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasNoBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasNoBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. 
+ +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_16:49:40 + +HASBLAS=hasNoBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.756606e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.155088e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.561577e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.537651 sec + 2,186,941,067 cycles # 2.809 GHz + 3,125,534,216 instructions # 1.43 insn per cycle + 0.834390897 seconds time elapsed +......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028807e+00 +Avg ME (F77/GPU) = 2.0288063984103686 +Relative difference = 2.9652383466921405e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.767944e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.812249e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.812249e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 6.039437 sec + 17,472,986,286 cycles # 2.891 GHz + 46,424,951,460 instructions # 2.66 insn per cycle + 6.045113130 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063903750300 +Relative difference = 3.0048445715164216e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.115406e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.269058e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.269058e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.475319 sec + 9,963,493,199 cycles # 2.863 GHz + 27,538,476,105 instructions # 2.76 insn per cycle + 3.481071152 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2543) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063903750300 +Relative difference = 3.0048445715164216e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 4.946610e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.336487e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.336487e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.229478 sec + 5,990,602,521 cycles # 2.681 GHz + 12,432,421,413 instructions # 2.08 insn per cycle + 2.235415428 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2753) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 5.285571e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.719782e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.719782e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.092266 sec + 5,708,527,225 cycles # 2.722 GHz + 11,999,256,931 instructions # 2.10 insn per cycle + 2.098089382 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2553) (512y: 126) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.527493e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.713588e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.713588e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.081621 sec + 5,593,729,597 cycles # 1.813 GHz + 7,978,349,260 instructions # 1.43 insn per cycle + 3.087480023 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1645) (512y: 104) (512z: 1823) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt index ffa5410982..1465355626 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_08:59:06 +DATE: 2025-10-11_15:18:40 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.422071e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.351796e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.985674e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.777084e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.077254e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.446466e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.535641 sec -INFO: No Floating Point Exceptions have been reported - 2,214,747,611 cycles # 2.879 GHz - 3,172,033,471 instructions # 1.43 insn per cycle - 0.829540839 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 212 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.540754 sec + 2,303,579,994 cycles # 2.845 GHz + 3,194,596,199 instructions # 1.39 insn per cycle + 0.867263238 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 168 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063423243874 -Relative difference = 3.241686432649386e-07 +Avg ME (F77/GPU) = 2.0288063984103686 +Relative difference = 2.9652383466921405e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.862163e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.911340e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.911340e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.824688e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.871754e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.871754e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.777703 sec -INFO: No Floating Point Exceptions have been reported - 17,097,861,095 cycles # 2.954 GHz - 45,230,787,591 instructions # 2.65 insn per cycle - 5.789414615 seconds time elapsed +TOTAL : 5.855357 sec + 17,037,217,478 cycles # 2.907 GHz + 45,397,533,623 instructions # 2.66 insn per cycle + 5.861206077 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 568) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal 
loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.356972e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.536408e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.536408e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.237044e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.404010e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.404010e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.270231 sec -INFO: No Floating Point Exceptions have been reported - 9,665,855,757 cycles # 2.946 GHz - 26,370,377,514 instructions # 2.73 insn per cycle - 3.281726897 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2386) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.349468 sec + 9,646,439,674 cycles # 2.877 GHz + 26,137,505,372 instructions # 2.71 insn per cycle + 3.359990731 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2348) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.515319e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.832036e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.832036e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) 
= ( 4.466137e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.774981e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.774981e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.470441 sec -INFO: No Floating Point Exceptions have been reported - 6,884,599,220 cycles # 2.774 GHz - 14,150,233,239 instructions # 2.06 insn per cycle - 2.482504065 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2896) (512y: 0) (512z: 0) +TOTAL : 2.456437 sec + 6,697,050,662 cycles # 2.721 GHz + 13,944,204,689 instructions # 2.08 insn per cycle + 2.462051029 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2872) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.744762e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.096792e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.096792e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.691262e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.027361e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.027361e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.356796 sec -INFO: No Floating Point 
Exceptions have been reported - 6,551,408,744 cycles # 2.767 GHz - 13,642,717,150 instructions # 2.08 insn per cycle - 2.368190066 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2535) (512y: 302) (512z: 0) +TOTAL : 2.343988 sec + 6,390,605,834 cycles # 2.721 GHz + 13,479,985,492 instructions # 2.11 insn per cycle + 2.349738024 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2521) (512y: 302) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.568399e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.763148e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.763148e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.551855e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.739422e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.739422e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.086761 sec -INFO: No Floating Point Exceptions have been reported - 5,741,113,391 cycles # 1.854 GHz - 9,326,512,235 instructions # 1.62 insn per cycle - 3.098253222 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1456) (512y: 212) (512z: 2060) +TOTAL : 3.060308 
sec + 5,571,902,780 cycles # 1.818 GHz + 9,121,747,396 instructions # 1.64 insn per cycle + 3.066113600 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1425) (512y: 212) (512z: 2028) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.scaling new file mode 100644 index 0000000000..13f478253e --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +DATE: 2025-10-11_15:41:41 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +9.342009e+05 1 256 +1.901727e+06 2 256 +3.513575e+06 4 256 +6.551587e+06 8 256 +9.027157e+06 16 256 +1.070472e+07 32 256 +1.211534e+07 64 256 +1.306873e+07 128 256 +1.345611e+07 256 256 +1.354148e+07 512 256 +1.365009e+07 1024 256 +### GPU: scaling test 32 +1.205755e+05 1 32 +2.514606e+05 2 32 +5.001172e+05 4 32 +9.511001e+05 8 32 +1.851142e+06 16 32 +3.545547e+06 32 32 +6.694933e+06 64 32 +9.515800e+06 128 32 +1.033055e+07 256 32 +1.109138e+07 512 32 +1.156765e+07 1024 32 +1.192504e+07 2048 32 +1.207986e+07 4096 32 +1.213861e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.335000e+04 1 256 +2.360867e+04 2 256 +2.368335e+04 4 256 +### CPU: scaling test 32 +2.236539e+04 1 32 +2.311725e+04 2 32 +2.306838e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.370978e+04 1 256 +4.405634e+04 2 256 +4.456211e+04 4 256 +### CPU: scaling test 32 +3.836659e+04 1 32 +4.179709e+04 2 32 +4.369754e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +8.926025e+04 1 256 +8.558488e+04 2 256 +8.539748e+04 4 256 +### CPU: scaling test 32 +8.398708e+04 1 32 
+8.906950e+04 2 32 +8.745810e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +9.556008e+04 1 256 +9.646045e+04 2 256 +9.528700e+04 4 256 +### CPU: scaling test 32 +8.322886e+04 1 32 +8.916295e+04 2 32 +9.000274e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.425669e+04 1 256 +6.732158e+04 2 256 +6.696446e+04 4 256 +### CPU: scaling test 32 +6.780265e+04 1 32 +6.786649e+04 2 32 +6.753983e+04 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 028292e268..53423221d6 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,223 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-10-06_09:00:14 +DATE: 2025-10-11_15:20:08 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.612194e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.849217e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.964394e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.590985e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.195514e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.215933e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.477846 sec -INFO: No Floating Point Exceptions have been reported - 1,998,983,760 cycles # 2.871 GHz - 2,812,176,587 instructions # 1.41 insn per cycle - 0.759674168 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.475543 sec + 2,072,965,387 cycles # 2.836 GHz + 2,812,513,904 instructions # 1.36 insn per cycle + 0.789686961 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 48 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.042987e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.232338e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.242858e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.134307e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.362144e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.374708e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.621402 sec -INFO: No Floating Point Exceptions have been reported - 2,510,286,495 cycles # 2.883 GHz - 3,752,986,245 instructions # 1.50 insn per cycle - 0.931747637 seconds time elapsed +TOTAL : 0.566501 sec + 2,402,738,046 cycles # 2.849 GHz + 3,415,144,104 instructions # 1.42 insn per cycle + 0.902303425 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.413122e+00 -Avg ME (F77/GPU) = 1.4131213684418649 -Relative difference = 4.469239988637851e-07 +Avg ME (F77/GPU) = 1.4131213684418646 +Relative difference = 4.4692399902091566e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.434605e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.446812e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.446812e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.360536e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.372172e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.372172e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.752117 sec -INFO: No Floating Point Exceptions have been reported - 19,916,103,310 cycles # 2.949 GHz - 59,916,518,373 instructions # 3.01 insn per cycle - 6.756066066 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1199) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.962552 sec + 20,052,897,229 cycles # 2.879 GHz + 60,517,484,268 instructions # 3.02 insn per cycle + 6.966626285 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1297) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.568526e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.611480e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.611480e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.457200e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.498681e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.498681e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.606956 sec -INFO: No Floating Point Exceptions have been reported - 10,571,212,167 cycles # 2.928 GHz - 31,086,653,440 instructions # 2.94 insn per cycle - 3.611892241 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5221) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.696167 sec + 10,707,329,548 cycles # 2.895 GHz + 31,170,881,652 instructions # 2.91 insn per cycle + 3.700212507 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5107) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: 
The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.091675e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.256165e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.256165e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.870920e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.029877e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.029877e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.823998 sec -INFO: No Floating Point Exceptions have been reported - 4,999,238,647 cycles # 2.738 GHz - 11,406,827,724 instructions # 2.28 insn per cycle - 1.827985092 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4642) (512y: 0) (512z: 0) +TOTAL : 1.867542 sec + 5,077,134,246 cycles # 2.714 GHz + 11,510,163,524 instructions # 2.27 insn per cycle + 1.871736808 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4658) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416466 Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.026950e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.047965e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.047965e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.650179e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.846221e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.846221e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.617207 sec -INFO: No Floating Point Exceptions have been reported - 4,447,500,259 cycles # 2.747 GHz - 10,665,398,274 instructions # 2.40 insn per cycle - 1.621167175 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4378) (512y: 92) (512z: 0) +TOTAL : 1.718355 sec + 4,666,627,650 cycles # 2.711 GHz + 10,813,430,115 instructions # 2.32 insn per cycle + 1.722417533 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4482) (512y: 57) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ 
PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416466 Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.168386e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.273905e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.273905e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.895380e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.991775e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.991775e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.309115 sec -INFO: No Floating Point Exceptions have been reported - 4,128,751,307 cycles # 1.785 GHz - 5,972,449,468 instructions # 1.45 insn per cycle - 2.314144205 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1620) (512y: 94) (512z: 3577) +TOTAL : 2.398459 sec + 4,202,110,606 cycles # 1.750 GHz + 6,028,015,369 instructions # 1.43 insn per cycle + 2.402798408 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1720) (512y: 63) (512z: 3552) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_blasOn.scaling new file mode 100644 index 0000000000..88f80f3081 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_blasOn.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +DATE: 2025-10-11_15:56:53 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +3.480668e+05 1 256 +6.757720e+05 2 256 +1.342710e+06 4 256 +1.961408e+06 8 256 +2.863939e+06 16 256 +3.692840e+06 32 256 +4.108363e+06 64 256 +4.389055e+06 128 256 +4.590159e+06 256 256 +4.677980e+06 512 256 +4.719776e+06 1024 256 +### GPU: scaling test 32 +5.093214e+04 1 32 +9.453332e+04 2 32 +1.923664e+05 4 32 +3.828673e+05 8 32 +7.100352e+05 16 32 +1.286052e+06 32 32 +2.074968e+06 64 32 +2.993421e+06 128 32 +3.590529e+06 256 32 +4.025040e+06 512 32 +4.233186e+06 1024 32 +4.428606e+06 2048 32 +4.494795e+06 4096 32 +4.506986e+06 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.283518e+04 1 256 +2.360000e+04 2 256 +2.368362e+04 4 256 +### CPU: scaling test 32 +2.195483e+04 1 32 +2.267087e+04 2 32 +2.328199e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.369761e+04 1 256 +4.426783e+04 2 256 +4.443961e+04 4 256 +### CPU: scaling test 32 +4.205894e+04 1 32 +4.154644e+04 2 32 +4.180789e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +8.635620e+04 1 256 +8.373531e+04 2 256 +8.654539e+04 4 256 +### CPU: scaling test 32 +8.995865e+04 1 32 +8.789712e+04 2 32 +8.901054e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +9.711265e+04 1 256 +9.722643e+04 2 256 +9.347803e+04 4 256 +### CPU: scaling test 32 +9.518909e+04 1 32 +9.721140e+04 2 32 +9.724959e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe 
+### CPU: scaling test 256 +6.678497e+04 1 256 +6.627189e+04 2 256 +6.803332e+04 4 256 +### CPU: scaling test 32 +6.749432e+04 1 32 +6.701283e+04 2 32 +6.598727e+04 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt index 76636470b0..5ea3c579b2 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,272 +10,231 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-10-06_09:38:23 +DATE: 2025-10-11_16:29:39 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.472313e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.180220e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.180220e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.808698e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.065448e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.065448e+06 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.504857 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,064,539,289 cycles # 2.862 GHz - 3,123,566,672 instructions # 1.51 insn per cycle - 0.778239097 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +TOTAL : 0.500490 sec + 2,152,747,639 cycles # 2.835 GHz + 3,089,120,012 instructions # 1.43 insn per cycle + 0.817131761 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 48 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.683325e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.341961e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.341961e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.720979e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.001076e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.001076e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.833212 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 3,141,452,514 cycles # 2.889 GHz - 4,965,295,428 instructions # 1.58 insn per cycle - 1.145190233 seconds time elapsed +TOTAL : 0.786088 sec + 3,079,796,138 cycles # 2.856 GHz + 4,693,820,986 instructions # 1.52 insn per cycle + 1.137301736 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.413122e+00 -Avg ME (F77/GPU) = 1.4131213684418649 -Relative difference = 4.469239988637851e-07 +Avg ME (F77/GPU) = 1.4131213684418646 +Relative difference = 4.4692399902091566e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: 
The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.439308e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.451643e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.451643e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.340726e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.352294e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.352294e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.745227 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 19,922,326,116 cycles # 2.952 GHz - 59,921,657,661 instructions # 3.01 insn per cycle - 6.749767217 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1199) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.027688 sec + 20,121,022,602 cycles # 2.862 GHz + 60,520,827,051 instructions # 3.01 insn per cycle + 7.031786887 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1297) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.590762e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.634359e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.634359e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.433303e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.475603e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.475603e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.596308 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 10,606,556,243 cycles # 2.946 GHz - 31,132,640,347 instructions # 2.94 insn per cycle - 3.600784290 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5221) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.724019 sec + 10,754,955,259 cycles # 2.886 GHz + 31,220,075,253 instructions # 2.90 insn per cycle + 3.728441609 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5107) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.045361e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.212711e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.212711e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.799230e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.961399e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.961399e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.840181 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 5,054,435,549 cycles # 2.741 GHz - 11,457,891,523 instructions # 2.27 insn per cycle - 1.844724432 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4642) (512y: 0) (512z: 0) +TOTAL : 1.890149 sec + 5,120,442,526 cycles # 2.704 GHz + 11,558,215,171 instructions # 2.26 insn per cycle + 1.894456584 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4658) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416466 Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.028589e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.049854e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.049854e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.595269e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.785975e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.785975e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.621206 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 4,484,828,096 cycles # 2.760 GHz - 10,715,944,638 instructions # 2.39 insn per cycle - 1.625802151 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4378) (512y: 92) (512z: 0) +TOTAL : 1.735302 sec + 4,701,578,061 cycles # 2.704 GHz + 10,861,447,059 instructions # 2.31 insn per cycle + 1.739681098 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4482) (512y: 57) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416466 Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.165257e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.268564e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.268564e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.737162e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.834485e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.834485e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.316443 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 4,162,925,866 cycles # 1.795 GHz - 6,008,954,577 instructions # 1.44 insn per cycle - 2.321140123 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1620) (512y: 94) (512z: 3577) +TOTAL : 2.462185 sec + 4,238,690,147 cycles # 1.719 GHz + 6,064,850,138 instructions # 1.43 insn per cycle + 2.466509903 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1720) (512y: 63) (512z: 3552) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt index 49402063e2..2fc1d7dc04 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,223 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-10-06_09:00:40 +DATE: 2025-10-11_15:20:41 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.575064e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.921304e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.028957e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.786288e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.203485e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.221467e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.477703 sec -INFO: No Floating Point Exceptions have been reported - 1,994,590,518 cycles # 2.865 GHz - 2,848,992,929 instructions # 1.43 insn per cycle - 0.754407053 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.470896 sec + 2,028,123,419 cycles # 2.825 GHz + 2,812,031,573 instructions # 1.39 insn per cycle + 0.775558684 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 48 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.042325e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.231825e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.242712e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.146437e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.383510e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.397548e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.616679 sec -INFO: No Floating Point Exceptions have been reported - 2,463,746,118 cycles # 2.874 GHz - 3,716,874,386 instructions # 1.51 insn per cycle - 0.917442132 seconds time elapsed +TOTAL : 0.569288 sec + 2,428,652,206 cycles # 2.852 GHz + 3,427,874,591 instructions # 1.41 insn per cycle + 0.912714324 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.413122e+00 -Avg ME (F77/GPU) = 1.4131213684418649 -Relative difference = 4.469239988637851e-07 +Avg ME (F77/GPU) = 1.4131213684418646 +Relative difference = 4.4692399902091566e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.437110e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.449363e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.449363e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.386609e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.398461e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.398461e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.744189 sec -INFO: No Floating Point Exceptions have been reported - 19,899,963,729 cycles # 2.950 GHz - 60,130,622,589 instructions # 3.02 insn per cycle - 6.748077481 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1322) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.886307 sec + 19,965,917,518 cycles # 2.898 GHz + 60,201,240,687 instructions # 3.02 insn per cycle + 6.890252778 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1136) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.632122e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.676125e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.676125e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.533737e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.576916e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.576916e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.557509 sec -INFO: No Floating Point Exceptions have been reported - 10,482,296,489 cycles # 2.944 GHz - 30,686,942,862 instructions # 2.93 insn per cycle - 3.561419011 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5047) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.633851 sec + 10,579,683,505 cycles # 2.909 GHz + 30,847,655,837 instructions # 2.92 insn per cycle + 3.638097883 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4930) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest_cpp.exe -INFO: 
The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.842314e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.999775e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.999775e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.536026e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.682366e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.682366e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.874706 sec -INFO: No Floating Point Exceptions have been reported - 5,138,957,277 cycles # 2.738 GHz - 11,840,408,683 instructions # 2.30 insn per cycle - 1.878700358 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4746) (512y: 0) (512z: 0) +TOTAL : 1.939515 sec + 5,249,266,634 cycles # 2.702 GHz + 11,982,858,846 instructions # 2.28 insn per cycle + 1.943675108 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4772) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416466 Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.602387e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.789550e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.789550e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.187873e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.358429e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.358429e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.727070 sec -INFO: No Floating Point Exceptions have been reported - 4,726,480,466 cycles # 2.731 GHz - 11,165,052,550 instructions # 2.36 insn per cycle - 1.731070886 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4403) (512y: 246) (512z: 0) +TOTAL : 1.803322 sec + 4,846,320,602 cycles # 2.683 GHz + 11,310,325,393 instructions # 2.33 insn per cycle + 1.807176987 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4455) (512y: 231) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest_cpp.exe [ 
PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416466 Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.101185e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.203049e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.203049e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.783861e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.878450e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.878450e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.329881 sec -INFO: No Floating Point Exceptions have been reported - 4,155,200,887 cycles # 1.781 GHz - 6,223,800,996 instructions # 1.50 insn per cycle - 2.334090572 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1516) (512y: 139) (512z: 3679) +TOTAL : 2.437468 sec + 4,222,471,079 cycles # 1.730 GHz + 6,310,155,112 instructions # 1.49 insn per cycle + 2.441536708 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1619) (512y: 119) (512z: 3648) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.scaling new file mode 100644 index 0000000000..66fa52db02 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +DATE: 2025-10-11_15:42:24 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +1.020563e+06 1 256 +1.907125e+06 2 256 +3.779714e+06 4 256 +7.211953e+06 8 256 +1.376478e+07 16 256 +2.148631e+07 32 256 +2.475235e+07 64 256 +2.658152e+07 128 256 +2.709334e+07 256 256 +2.813503e+07 512 256 +2.865513e+07 1024 256 +### GPU: scaling test 32 +1.249239e+05 1 32 +2.576023e+05 2 32 +5.236416e+05 4 32 +9.816703e+05 8 32 +1.909308e+06 16 32 +3.564529e+06 32 32 +7.104303e+06 64 32 +1.425315e+07 128 32 +2.099087e+07 256 32 +2.446553e+07 512 32 +2.604809e+07 1024 32 +2.693465e+07 2048 32 +2.780197e+07 4096 32 +2.832618e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.475086e+04 1 256 +2.477196e+04 2 256 +2.498053e+04 4 256 +### CPU: scaling test 32 +2.306794e+04 1 32 +2.472476e+04 2 32 +2.481117e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +7.800127e+04 1 256 +7.895709e+04 2 256 +7.905572e+04 4 256 +### CPU: scaling test 32 +7.190850e+04 1 32 +7.327190e+04 2 32 +7.683355e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.743170e+05 1 256 +1.714585e+05 2 256 +1.739702e+05 4 256 +### CPU: scaling test 32 +1.605789e+05 1 32 +1.673207e+05 2 32 +1.747798e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.847081e+05 1 256 +1.886928e+05 2 256 +1.844591e+05 4 256 +### CPU: scaling test 32 +1.678389e+05 1 32 +1.901615e+05 2 32 +1.805064e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe 
+### CPU: scaling test 256 +1.398580e+05 1 256 +1.377336e+05 2 256 +1.394286e+05 4 256 +### CPU: scaling test 32 +1.350638e+05 1 32 +1.419406e+05 2 32 +1.392215e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index b4d9344f80..359e7877d9 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,251 +10,226 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-10-06_09:01:57 +DATE: 2025-10-11_15:22:22 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.641235e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.015793e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.057654e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.457409 sec -INFO: No Floating Point Exceptions have been reported - 1,937,244,275 cycles # 2.867 GHz - 2,710,892,637 instructions # 1.40 insn per cycle - 0.733854811 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 226 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.012111e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.590020e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.652888e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008472e+02 +- 5.002446e+01 ) GeV^-2 +TOTAL : 0.461660 sec + 2,024,209,134 cycles # 2.804 GHz + 2,785,160,230 instructions # 1.38 insn per cycle + 0.779091198 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 211 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.672412e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.384843e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.427387e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.304364e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.823335e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.855285e+07 ) sec^-1 MeanMatrixElemValue = ( 6.630097e+02 +- 4.770717e+02 ) GeV^-2 -TOTAL : 0.509900 sec -INFO: No Floating Point Exceptions have been reported - 2,162,696,786 cycles # 2.871 GHz - 3,100,226,347 instructions # 1.43 insn per cycle - 0.811215095 seconds time elapsed +TOTAL : 0.506727 sec + 2,201,759,148 cycles # 2.852 GHz + 3,068,173,195 instructions # 1.39 insn per cycle + 0.828420263 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.412607e+00 -Avg ME (F77/GPU) = 1.4132214305330990 -Relative difference = 0.0004349621183379836 +Avg ME (F77/GPU) = 1.4132214458495582 +Relative difference = 0.0004349729610275725 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.513642e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.526564e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.526564e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.501069e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.514090e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.514090e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.537186 sec -INFO: No Floating Point Exceptions have been reported - 19,278,711,706 cycles # 2.948 GHz - 59,616,757,005 instructions # 3.09 insn per cycle - 6.541004954 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 959) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.569879 sec + 19,152,579,978 cycles # 2.914 GHz + 59,680,745,465 instructions # 3.12 insn per cycle + 6.573833440 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 926) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.412995e+00 Avg ME (F77/C++) = 1.4129949096991936 Relative difference = 6.390737857384068e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.120315e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.259615e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.259615e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.920524e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.053952e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.053952e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 2.035209 sec -INFO: No Floating Point Exceptions have been reported - 6,010,527,138 cycles # 2.949 GHz - 17,061,942,080 instructions # 2.84 insn per cycle - 2.038918474 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5856) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.086277 sec + 6,057,068,110 cycles # 2.899 GHz + 17,105,898,955 instructions # 2.82 insn per cycle + 2.090214636 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5745) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The 
following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.412995e+00 -Avg ME (F77/C++) = 1.4129954647353316 -Relative difference = 3.2890090308261873e-07 +Avg ME (F77/C++) = 1.4129954481297773 +Relative difference = 3.171488768794332e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.748972e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.811746e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.811746e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.680104e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.737565e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.737565e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 0.954915 sec -INFO: No Floating Point Exceptions have been reported - 2,640,169,352 cycles # 2.756 GHz - 6,187,458,591 instructions # 2.34 insn per cycle - 0.958678404 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5105) (512y: 0) (512z: 0) +TOTAL : 0.993425 sec + 2,677,007,034 cycles # 2.687 GHz + 6,240,512,600 instructions # 2.33 insn per cycle + 0.997226702 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5122) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE 
program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413313e+00 -Avg ME (F77/C++) = 1.4133132969790267 -Relative difference = 2.1012969292986113e-07 +Avg ME (F77/C++) = 1.4133132974634464 +Relative difference = 2.104724475889719e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.923079e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.998771e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.998771e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.843149e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.912179e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.912179e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 0.870044 sec -INFO: No Floating Point Exceptions have been reported - 2,402,321,989 cycles # 2.751 GHz - 5,790,080,813 instructions # 2.41 insn per cycle - 0.873863245 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4906) (512y: 37) (512z: 0) +TOTAL : 0.907079 sec + 2,478,306,991 cycles # 2.723 GHz + 5,867,870,372 instructions # 2.37 insn per cycle + 0.910927509 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5009) (512y: 2) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, 
FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413313e+00 -Avg ME (F77/C++) = 1.4133132969790267 -Relative difference = 2.1012969292986113e-07 +Avg ME (F77/C++) = 1.4133132974634464 +Relative difference = 2.104724475889719e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.455132e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.498332e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.498332e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.382994e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.423338e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.423338e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.146003 sec -INFO: No Floating Point Exceptions have been reported - 2,072,911,951 cycles # 1.804 GHz - 3,391,607,808 instructions # 1.64 insn per cycle - 1.149850121 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2237) (512y: 37) (512z: 3789) +TOTAL : 1.206279 sec + 2,116,978,988 cycles # 1.750 GHz + 3,424,879,930 instructions # 1.62 insn per cycle + 1.210305817 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2346) (512y: 7) (512z: 3767) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133164033579249 -Relative difference = 2.85398258307829e-07 +Avg ME (F77/C++) = 1.4133162104498354 +Relative difference = 1.48905011572879e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_blasOn.scaling new file mode 100644 index 0000000000..03b7dc0471 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_blasOn.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +DATE: 2025-10-11_15:58:16 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +3.727486e+05 1 256 +7.374228e+05 2 256 +1.359495e+06 4 256 +2.228941e+06 8 256 +3.376485e+06 16 256 +4.469020e+06 32 256 +5.249324e+06 64 256 +5.869764e+06 128 256 +6.094954e+06 256 256 +6.260097e+06 512 256 +6.357949e+06 1024 256 +### GPU: scaling test 32 +5.112115e+04 1 32 +9.374377e+04 2 32 +1.887009e+05 4 32 +3.960359e+05 8 32 +7.300603e+05 16 32 +1.308116e+06 32 32 +1.995847e+06 64 32 +3.417585e+06 128 32 +4.455777e+06 256 32 +5.284200e+06 512 32 +5.826269e+06 1024 32 +6.082445e+06 2048 32 +6.255269e+06 4096 32 +6.329872e+06 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.438060e+04 1 256 +2.470219e+04 2 256 +2.476066e+04 4 256 +### CPU: scaling test 32 +2.461887e+04 1 32 +2.470134e+04 2 32 +2.410740e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +7.129456e+04 1 256 +7.835869e+04 2 256 +7.787307e+04 4 256 +### CPU: scaling test 32 +6.724611e+04 1 32 +6.848385e+04 2 32 +7.303564e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.606597e+05 1 256 +1.630584e+05 2 256 +1.606208e+05 4 256 +### CPU: scaling test 32 +1.551508e+05 1 32 +1.588322e+05 2 32 +1.636465e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.742285e+05 1 256 +1.758288e+05 2 
256 +1.738872e+05 4 256 +### CPU: scaling test 32 +1.750902e+05 1 32 +1.718448e+05 2 32 +1.870659e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.405438e+05 1 256 +1.389272e+05 2 256 +1.380473e+05 4 256 +### CPU: scaling test 32 +1.416732e+05 1 32 +1.383910e+05 2 32 +1.393492e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt index 89f1af02c0..b34d8177c5 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,275 +10,234 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-10-06_09:38:49 +DATE: 2025-10-11_16:30:12 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.430077e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.496267e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.496267e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.009071e+02 +- 5.002295e+01 ) GeV^-2 -TOTAL : 0.468595 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 1,993,529,576 cycles # 2.878 GHz - 2,894,144,626 instructions # 1.45 insn per cycle - 0.749153323 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +EvtsPerSec[Rmb+ME] (23) = ( 4.563182e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.822216e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.822216e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.009070e+02 +- 5.002294e+01 ) GeV^-2 +TOTAL : 0.474333 sec + 2,020,095,914 cycles # 2.815 GHz + 2,863,432,755 instructions # 1.42 insn per cycle + 0.775295436 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 211 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 226 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.508973e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.254431e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.254431e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.400607e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.017646e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.017646e+07 ) sec^-1 MeanMatrixElemValue = ( 6.737499e+02 +- 4.776369e+02 ) GeV^-2 -TOTAL : 0.658615 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,580,648,777 cycles # 2.888 GHz - 3,894,936,658 instructions # 1.51 insn per cycle - 0.952346890 seconds time elapsed +TOTAL : 0.650114 sec + 2,601,943,365 cycles # 2.840 GHz + 3,913,396,482 instructions # 1.50 insn per cycle + 0.976170377 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.412607e+00 -Avg ME (F77/GPU) = 1.4132214305330990 -Relative difference = 0.0004349621183379836 +Avg ME (F77/GPU) = 1.4132214458495582 +Relative difference = 0.0004349729610275725 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: 
The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.506368e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.519408e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.519408e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.486527e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.499486e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.499486e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.559975 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 19,280,512,048 cycles # 2.938 GHz - 59,619,141,119 instructions # 3.09 insn per cycle - 6.564243260 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 959) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.611886 sec + 19,177,870,695 cycles # 2.899 GHz + 59,684,285,229 instructions # 3.11 insn per cycle + 6.615966746 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 926) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.412995e+00 Avg ME (F77/C++) = 1.4129949096991936 Relative difference = 6.390737857384068e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.092271e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.230160e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.230160e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.840675e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.974875e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.974875e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 2.047307 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 6,043,775,465 cycles # 2.947 GHz - 17,111,089,922 instructions # 2.83 insn per cycle - 2.051614364 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5856) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.112189 sec + 6,078,517,802 cycles # 2.874 GHz + 17,153,031,314 instructions # 2.82 insn per cycle + 2.116275288 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5745) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.412995e+00 -Avg ME (F77/C++) = 1.4129954647353316 -Relative difference = 3.2890090308261873e-07 +Avg ME (F77/C++) = 1.4129954481297773 +Relative difference = 3.171488768794332e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.748354e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.809701e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.809701e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.674765e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.733725e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.733725e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 0.959425 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,659,679,748 cycles # 2.761 GHz - 6,224,393,438 instructions # 2.34 insn per cycle - 0.963869172 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5105) (512y: 0) (512z: 0) +TOTAL : 1.001010 sec + 2,696,240,098 cycles # 2.685 GHz + 6,276,404,164 instructions # 2.33 insn per cycle + 1.005076444 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5122) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413313e+00 -Avg ME (F77/C++) = 1.4133132969790267 -Relative difference = 2.1012969292986113e-07 +Avg ME (F77/C++) = 1.4133132974634464 +Relative difference = 2.104724475889719e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.927524e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.002486e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.002486e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.832147e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.902384e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.902384e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 0.872058 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,421,094,163 cycles # 2.765 GHz - 5,826,830,021 instructions # 2.41 insn per cycle - 0.876372578 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4906) (512y: 37) (512z: 0) +TOTAL : 0.916582 sec + 2,498,079,452 cycles # 2.717 GHz + 5,903,755,317 instructions # 2.36 insn per cycle + 0.920755361 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5009) (512y: 2) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413313e+00 -Avg ME (F77/C++) = 1.4133132969790267 -Relative difference = 2.1012969292986113e-07 +Avg ME (F77/C++) = 1.4133132974634464 +Relative difference = 2.104724475889719e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.443486e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.486864e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.486864e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.388850e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.429977e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.429977e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.160150 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,098,432,349 cycles # 1.804 GHz - 3,433,067,927 instructions # 1.64 insn per cycle - 1.164579445 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2237) (512y: 37) (512z: 3789) +TOTAL : 1.204887 sec + 2,137,027,835 cycles # 1.769 GHz + 3,465,402,298 instructions # 1.62 insn per cycle + 1.209022745 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2346) (512y: 7) (512z: 3767) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133164033579249 -Relative difference = 2.85398258307829e-07 +Avg ME (F77/C++) = 1.4133162104498354 +Relative difference = 1.48905011572879e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt index 7537d3c84d..1d664001ba 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,251 +10,226 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
 make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
 make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-10-06_09:02:18 +DATE: 2025-10-11_15:22:52 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.658659e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.027503e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.066373e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.462988 sec -INFO: No Floating Point Exceptions have been reported - 1,956,715,427 cycles # 2.872 GHz - 2,757,694,861 instructions # 1.41 insn per cycle - 0.742544959 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 226 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.986981e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.577936e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.642909e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008472e+02 +- 5.002446e+01 ) GeV^-2 +TOTAL : 0.465752 sec + 2,027,464,804 cycles # 2.839 GHz + 2,776,602,524 instructions # 1.37 insn per cycle + 0.772091406 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 203 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... 
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.669827e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.371215e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.415741e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.311817e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.830173e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.862677e+07 ) sec^-1 MeanMatrixElemValue = ( 6.630097e+02 +- 4.770717e+02 ) GeV^-2 -TOTAL : 0.505811 sec -INFO: No Floating Point Exceptions have been reported - 2,123,611,289 cycles # 2.883 GHz - 3,083,974,467 instructions # 1.45 insn per cycle - 0.793454464 seconds time elapsed +TOTAL : 0.507862 sec + 2,193,078,964 cycles # 2.843 GHz + 3,061,556,319 instructions # 1.40 insn per cycle + 0.829701653 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.412607e+00 -Avg ME (F77/GPU) = 1.4132214305330990 -Relative difference = 0.0004349621183379836 +Avg ME (F77/GPU) = 1.4132214458495582 +Relative difference = 0.0004349729610275725 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, 
FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.488365e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.501255e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.501255e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.494083e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.506993e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.506993e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.602997 sec -INFO: No Floating Point Exceptions have been reported - 19,409,400,884 cycles # 2.938 GHz - 59,351,848,666 instructions # 3.06 insn per cycle - 6.606759387 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1027) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.588418 sec + 19,053,983,564 cycles # 2.891 GHz + 59,396,932,644 instructions # 3.12 insn per cycle + 6.592397812 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 868) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.412995e+00 Avg ME (F77/C++) = 1.4129949096991936 Relative difference = 6.390737857384068e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.484090e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.633368e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.633368e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.236693e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.382500e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.382500e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.948837 sec -INFO: No Floating Point Exceptions have been reported - 5,764,162,956 cycles # 2.953 GHz - 16,849,716,772 instructions # 2.92 insn per cycle - 1.952678468 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5610) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.007204 sec + 5,773,782,949 cycles # 2.872 GHz + 16,883,450,737 instructions # 2.92 insn per cycle + 2.011190459 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5486) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.412995e+00 -Avg ME (F77/C++) = 1.4129954647353316 -Relative difference = 3.2890090308261873e-07 +Avg ME (F77/C++) = 1.4129954481297773 +Relative difference = 3.171488768794332e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.522405e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.569181e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.569181e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.456033e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.499646e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.499646e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.094041 sec -INFO: No Floating Point Exceptions have been reported - 3,018,102,108 cycles # 2.750 GHz - 6,848,568,360 instructions # 2.27 insn per cycle - 1.098202042 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5735) (512y: 0) (512z: 0) +TOTAL : 1.143466 sec + 3,080,089,782 cycles # 2.686 GHz + 6,901,917,276 instructions # 2.24 insn per cycle + 1.147397013 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5760) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413313e+00 -Avg ME (F77/C++) = 1.4133132969790267 -Relative difference = 2.1012969292986113e-07 +Avg ME (F77/C++) = 1.4133132974634464 +Relative difference = 2.104724475889719e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.654265e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.710055e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.710055e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.551832e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.601891e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.601891e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.008735 sec -INFO: No Floating Point Exceptions have been reported - 2,794,533,058 cycles # 2.762 GHz - 6,437,695,564 instructions # 2.30 insn per cycle - 1.012558685 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5509) (512y: 23) (512z: 0) +TOTAL : 1.074026 sec + 2,869,050,546 cycles # 2.664 GHz + 6,490,617,462 instructions # 2.26 insn per cycle + 1.077819814 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5562) (512y: 8) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413313e+00 -Avg ME (F77/C++) = 1.4133132969790267 -Relative difference = 2.1012969292986113e-07 +Avg ME (F77/C++) = 1.4133132974634464 +Relative difference = 2.104724475889719e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.323435e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.360072e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.360072e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.278723e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.313246e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.313246e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.258302 sec -INFO: No Floating Point Exceptions have been reported - 2,251,923,496 cycles # 1.787 GHz - 3,755,291,572 instructions # 1.67 insn per cycle - 1.262174564 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2467) (512y: 28) (512z: 4084) +TOTAL : 1.301798 sec + 2,284,363,028 cycles # 1.751 GHz + 3,800,071,631 instructions # 1.66 insn per cycle + 1.305803750 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2577) (512y: 9) (512z: 4061) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133164033579249 -Relative difference = 2.85398258307829e-07 +Avg ME (F77/C++) = 1.4133162104498354 +Relative difference = 1.48905011572879e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.scaling new file mode 100644 index 0000000000..61f28ab393 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+
+make USEBUILDDIR=1 BACKEND=cpp512y
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+
+make USEBUILDDIR=1 BACKEND=cpp512z
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+
+DATE: 2025-10-11_15:42:03
+
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe
+### GPU: scaling test 256
+9.413980e+05 1 256
+1.824479e+06 2 256
+3.751768e+06 4 256
+6.821687e+06 8 256
+8.893057e+06 16 256
+1.069198e+07 32 256
+1.203562e+07 64 256
+1.299650e+07 128 256
+1.326879e+07 256 256
+1.353754e+07 512 256
+1.376766e+07 1024 256
+### GPU: scaling test 32
+1.264842e+05 1 32
+2.411881e+05 2 32
+5.002345e+05 4 32
+8.959915e+05 8 32
+1.929825e+06 16 32
+3.400412e+06 32 32
+6.965891e+06 64 32
+9.374242e+06 128 32
+1.031547e+07 256 32
+1.114517e+07 512 32
+1.169216e+07 1024 32
+1.186544e+07 2048 32
+1.211002e+07 4096 32
+1.215036e+07 8192 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+2.309135e+04 1 256
+2.331383e+04 2 256
+2.334383e+04 4 256
+### CPU: scaling test 32
+2.173266e+04 1 32
+2.264555e+04 2 32
+2.214409e+04 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+4.454087e+04 1 256
+4.509478e+04 2 256
+4.547146e+04 4 256
+### CPU: scaling test 32
+4.000635e+04 1 32
+4.240489e+04 2 32
+4.447787e+04 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+8.989478e+04 1 256
+8.788512e+04 2 256
+9.013990e+04 4 256
+### CPU: scaling test 32
+9.025857e+04 1 32
+9.054908e+04 2 32
+8.932416e+04 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+9.982270e+04 1 256
+9.959330e+04 2 256
+9.964108e+04 4 256
+### CPU: scaling test 32
+9.318362e+04 1 32
+1.002699e+05 2 32
+9.968832e+04 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+6.767141e+04 1 256
+6.818529e+04 2 256
+6.881658e+04 4 256
+### CPU: scaling test 32
+6.813396e+04 1 32
+6.831571e+04 2 32
+6.860475e+04 4 32
+=========================================================================
+
+TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
index 6b4617ba56..66176b2229 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
 
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
 BACKEND=cpp512y (was cppauto)
 OMPFLAGS=
 FPTYPE='m'
@@ -7,248 +10,223 @@ HELINL='0'
 HRDCOD='0'
 HASCURAND=hasCurand
 HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
 Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
 make: Nothing to be done for 'gtestlibs'.
 
 make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
 make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
 make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-10-06_09:01:06 +DATE: 2025-10-11_15:21:14 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.531107e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.896113e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.014318e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.723520e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.201379e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.219641e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.482291 sec -INFO: No Floating Point Exceptions have been reported - 1,996,726,100 cycles # 2.869 GHz - 2,875,927,393 instructions # 1.44 insn per cycle - 0.757518934 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.472516 sec + 2,054,090,006 cycles # 2.841 GHz + 2,817,756,219 instructions # 1.37 insn per cycle + 0.780308929 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 44 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.039985e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.227093e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.238483e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.127139e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.354786e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.367576e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.618226 sec -INFO: No Floating Point Exceptions have been reported - 2,476,524,825 cycles # 2.883 GHz - 3,787,822,568 instructions # 1.53 insn per cycle - 0.918414719 seconds time elapsed +TOTAL : 0.567470 sec + 2,434,469,025 cycles # 2.854 GHz + 3,429,413,924 instructions # 1.41 insn per cycle + 0.911221936 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.413122e+00 -Avg ME (F77/GPU) = 1.4131213755569487 -Relative difference = 4.418889885423659e-07 +Avg ME (F77/GPU) = 1.4131213912822083 +Relative difference = 4.3076096170606456e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.396101e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.408087e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.408087e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.325558e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.336921e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.336921e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.859626 sec -INFO: No Floating Point Exceptions have been reported - 20,206,369,377 cycles # 2.945 GHz - 60,950,595,896 instructions # 3.02 insn per cycle - 6.863727850 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1220) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.066864 sec + 20,436,241,353 cycles # 2.891 GHz + 61,613,414,820 instructions # 3.01 insn per cycle + 7.070927861 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1297) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213859069593 Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.651759e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.695029e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.695029e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.581252e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.624148e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.624148e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.542669 sec -INFO: No Floating Point Exceptions have been reported - 10,470,195,857 cycles # 2.953 GHz - 30,822,635,750 instructions # 2.94 insn per cycle - 3.546724112 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5351) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.596315 sec + 10,491,200,280 cycles # 2.915 GHz + 30,713,063,869 instructions # 2.93 insn per cycle + 3.600269209 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5149) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest_cpp.exe -INFO: 
The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213792564823 -Relative difference = 4.392710025734405e-07 +Avg ME (F77/C++) = 1.4131213813302705 +Relative difference = 4.3780348012864624e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.177717e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.345070e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.345070e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.021587e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.189187e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.189187e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.805877 sec -INFO: No Floating Point Exceptions have been reported - 4,960,900,655 cycles # 2.742 GHz - 11,360,293,322 instructions # 2.29 insn per cycle - 1.809915904 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4776) (512y: 0) (512z: 0) +TOTAL : 1.836324 sec + 4,963,572,150 cycles # 2.698 GHz + 11,329,877,800 instructions # 2.28 insn per cycle + 1.840366477 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4650) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause 
SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213600217192 -Relative difference = 4.5288254008796884e-07 +Avg ME (F77/C++) = 1.4131213646773610 +Relative difference = 4.495879612249832e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.047166e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.068679e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.068679e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.809724e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.000340e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.000340e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.585052 sec -INFO: No Floating Point Exceptions have been reported - 4,379,448,731 cycles # 2.757 GHz - 10,610,063,505 instructions # 2.42 insn per cycle - 1.588995755 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4503) (512y: 84) (512z: 0) +TOTAL : 1.690468 sec + 4,546,028,597 cycles # 2.684 GHz + 10,641,089,172 instructions # 2.34 insn per cycle + 1.694422805 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4468) (512y: 47) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, 
FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213600217192 -Relative difference = 4.5288254008796884e-07 +Avg ME (F77/C++) = 1.4131213646773610 +Relative difference = 4.495879612249832e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.890582e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.987179e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.987179e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.931835e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.029866e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.029866e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.401138 sec -INFO: No Floating Point Exceptions have been reported - 4,243,505,288 cycles # 1.765 GHz - 6,171,567,257 instructions # 1.45 insn per cycle - 2.405218093 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2143) (512y: 116) (512z: 3653) +TOTAL : 2.386097 sec + 4,162,019,401 cycles # 1.742 GHz + 5,999,960,287 instructions # 1.44 insn per cycle + 2.390275923 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1724) (512y: 63) (512z: 3594) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213786174055 Relative difference = 4.3972324717191576e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0_blasOn.scaling new file mode 100644 index 0000000000..d8428305ae --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0_blasOn.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +DATE: 2025-10-11_15:57:35 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +2.849872e+05 1 256 +5.950036e+05 2 256 +1.135532e+06 4 256 +9.336754e+05 8 256 +2.668945e+06 16 256 +3.526097e+06 32 256 +4.045575e+06 64 256 +4.557983e+06 128 256 +4.782891e+06 256 256 +4.835057e+06 512 256 +4.861240e+06 1024 256 +### GPU: scaling test 32 +3.826136e+04 1 32 +7.325127e+04 2 32 +1.481027e+05 4 32 +3.040622e+05 8 32 +6.040500e+05 16 32 +1.089306e+06 32 32 +1.777835e+06 64 32 +2.826455e+06 128 32 +3.481738e+06 256 32 +3.995216e+06 512 32 +4.416099e+06 1024 32 +4.561881e+06 2048 32 +4.594627e+06 4096 32 +4.620875e+06 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.314037e+04 1 256 +2.324071e+04 2 256 +2.351748e+04 4 256 +### CPU: scaling test 32 +2.156289e+04 1 32 +2.224284e+04 2 32 +2.270647e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.464955e+04 1 256 +4.456312e+04 2 256 +4.557593e+04 4 256 +### CPU: scaling test 32 +3.776841e+04 1 32 +4.243663e+04 2 32 +4.407623e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +8.329077e+04 1 256 +8.946504e+04 2 256 +8.934937e+04 4 256 +### CPU: scaling test 32 +8.542423e+04 1 32 +9.061011e+04 2 32 +9.100728e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +9.619475e+04 1 256 +1.000794e+05 2 
256 +9.841918e+04 4 256 +### CPU: scaling test 32 +9.793151e+04 1 32 +9.901818e+04 2 32 +9.971627e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.804216e+04 1 256 +6.812091e+04 2 256 +6.863263e+04 4 256 +### CPU: scaling test 32 +6.817141e+04 1 32 +6.704119e+04 2 32 +6.858619e+04 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt index 1a268fb0a6..b5540e725a 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,223 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-10-06_09:01:31 +DATE: 2025-10-11_15:21:49 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.506525e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.876419e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.986419e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.729045e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.193827e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.214345e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.475723 sec -INFO: No Floating Point Exceptions have been reported - 1,989,777,196 cycles # 2.876 GHz - 2,865,221,599 instructions # 1.44 insn per cycle - 0.750464789 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.476302 sec + 2,069,585,848 cycles # 2.841 GHz + 2,809,792,568 instructions # 1.36 insn per cycle + 0.788016398 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 44 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.040967e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.229706e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.240646e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.148157e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.386565e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.400273e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.612359 sec -INFO: No Floating Point Exceptions have been reported - 2,465,408,367 cycles # 2.885 GHz - 3,759,784,229 instructions # 1.53 insn per cycle - 0.914073870 seconds time elapsed +TOTAL : 0.562536 sec + 2,368,600,308 cycles # 2.829 GHz + 3,390,907,468 instructions # 1.43 insn per cycle + 0.897403591 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.413122e+00 -Avg ME (F77/GPU) = 1.4131213755569487 -Relative difference = 4.418889885423659e-07 +Avg ME (F77/GPU) = 1.4131213912822083 +Relative difference = 4.3076096170606456e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.395973e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.407808e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.407808e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.347035e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.358476e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.358476e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.859771 sec -INFO: No Floating Point Exceptions have been reported - 20,239,178,144 cycles # 2.949 GHz - 61,173,779,461 instructions # 3.02 insn per cycle - 6.863706451 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1272) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.001676 sec + 20,340,735,873 cycles # 2.904 GHz + 61,296,698,560 instructions # 3.01 insn per cycle + 7.005669304 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1136) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213859069593 Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.702334e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.747762e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.747762e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.588929e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.632804e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.632804e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.505938 sec -INFO: No Floating Point Exceptions have been reported - 10,333,154,234 cycles # 2.946 GHz - 30,534,348,115 instructions # 2.95 insn per cycle - 3.510016853 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5155) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.590204 sec + 10,378,021,696 cycles # 2.888 GHz + 30,395,025,188 instructions # 2.93 insn per cycle + 3.594207111 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4954) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest_cpp.exe -INFO: 
The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213792564823 -Relative difference = 4.392710025734405e-07 +Avg ME (F77/C++) = 1.4131213813302705 +Relative difference = 4.3780348012864624e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.861323e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.018375e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.018375e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.624880e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.780155e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.780155e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.870783 sec -INFO: No Floating Point Exceptions have been reported - 5,160,894,050 cycles # 2.755 GHz - 11,875,310,688 instructions # 2.30 insn per cycle - 1.874839635 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4887) (512y: 0) (512z: 0) +TOTAL : 1.920064 sec + 5,168,529,008 cycles # 2.687 GHz + 11,822,995,259 instructions # 2.29 insn per cycle + 1.924192404 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4749) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause 
SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213600217192 -Relative difference = 4.5288254008796884e-07 +Avg ME (F77/C++) = 1.4131213646773610 +Relative difference = 4.495879612249832e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.768245e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.957717e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.957717e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.374636e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.559382e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.559382e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.697611 sec -INFO: No Floating Point Exceptions have been reported - 4,679,050,155 cycles # 2.751 GHz - 11,168,862,734 instructions # 2.39 insn per cycle - 1.701628470 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4508) (512y: 239) (512z: 0) +TOTAL : 1.767863 sec + 4,740,196,866 cycles # 2.676 GHz + 11,146,224,662 instructions # 2.35 insn per cycle + 1.772001982 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4420) (512y: 221) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, 
FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213600217192 -Relative difference = 4.5288254008796884e-07 +Avg ME (F77/C++) = 1.4131213646773610 +Relative difference = 4.495879612249832e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.922687e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.020028e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.020028e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.914882e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.012925e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.012925e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.390116 sec -INFO: No Floating Point Exceptions have been reported - 4,256,907,095 cycles # 1.778 GHz - 6,411,350,564 instructions # 1.51 insn per cycle - 2.394737171 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2039) (512y: 162) (512z: 3731) +TOTAL : 2.391894 sec + 4,182,595,672 cycles # 1.747 GHz + 6,238,269,996 instructions # 1.49 insn per cycle + 2.395956127 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1623) (512y: 120) (512z: 3678) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213786174055 Relative difference = 4.3972324717191576e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.scaling new file mode 100644 index 0000000000..5a05ffd4cc --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_15:42:45 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +2.797622e+05 1 256 +3.709787e+05 2 256 +3.836692e+05 4 256 +4.274394e+05 8 256 +4.457291e+05 16 256 +4.426930e+05 32 256 +4.430121e+05 64 256 +4.414634e+05 128 256 +4.537983e+05 256 256 +4.587406e+05 512 256 +4.539498e+05 1024 256 +### GPU: scaling test 32 +5.646557e+04 1 32 +1.072891e+05 2 32 +1.807325e+05 4 32 +2.717613e+05 8 32 +3.826661e+05 16 32 +3.951829e+05 32 32 +4.316071e+05 64 32 +4.432349e+05 128 32 +4.449540e+05 256 32 +4.447744e+05 512 32 +4.444094e+05 1024 32 +4.520916e+05 2048 32 +4.578060e+05 4096 32 +4.571634e+05 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.852732e+03 1 256 +1.852838e+03 2 256 +1.863778e+03 4 256 +### CPU: scaling test 32 +1.849128e+03 1 32 +1.851000e+03 2 32 +1.853111e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.433326e+03 1 256 +3.428849e+03 2 256 +3.434375e+03 4 256 +### CPU: scaling test 32 +3.324011e+03 1 32 +3.385678e+03 2 32 +3.337661e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +7.888262e+03 1 256 +7.910674e+03 2 256 +7.940995e+03 4 256 +### CPU: scaling test 32 +7.181194e+03 1 32 +7.616753e+03 2 32 +7.493920e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +8.845276e+03 1 
256 +8.896166e+03 2 256 +8.958296e+03 4 256 +### CPU: scaling test 32 +8.632795e+03 1 32 +8.574113e+03 2 32 +8.618805e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.742240e+03 1 256 +6.762831e+03 2 256 +6.833848e+03 4 256 +### CPU: scaling test 32 +6.602630e+03 1 32 +6.602109e+03 2 32 +6.640282e+03 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index fe9e9669c6..5da31552e6 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:02:40 +DATE: 2025-10-11_15:23:20 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.331120e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.359202e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.361250e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.393219e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.441536e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.444704e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.543918 sec -INFO: No Floating Point Exceptions have been reported - 2,225,694,406 cycles # 2.884 GHz - 3,483,451,829 instructions # 1.57 insn per cycle - 0.837015502 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.499467 sec + 2,136,562,888 cycles # 2.840 GHz + 3,115,290,958 instructions # 1.46 insn per cycle + 0.813463478 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.134422e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.164730e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.165914e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.052190 sec -INFO: No Floating Point Exceptions have been reported - 9,689,726,748 cycles # 2.928 GHz - 22,118,867,491 instructions # 2.28 insn per cycle - 3.368998161 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 +Avg ME (F77/GPU) = 6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.884002e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.884932e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.884932e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.853765e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.854661e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.854661e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.713540 sec -INFO: No Floating Point Exceptions have been reported - 25,683,805,881 cycles # 2.947 GHz - 78,963,253,936 instructions # 3.07 insn per cycle - 8.717598721 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4842) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.853472 sec + 25,658,433,103 cycles # 2.897 GHz + 78,568,001,018 instructions # 3.06 insn per cycle + 8.857417932 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.540501e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.543820e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.543820e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.376471e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.379465e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.379465e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.639000 sec -INFO: No Floating Point Exceptions have been reported - 13,090,618,968 cycles # 2.820 GHz - 39,561,040,325 instructions # 3.02 insn per cycle - 4.644193645 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13192) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.863682 sec + 13,076,523,489 cycles # 2.687 GHz + 39,590,979,607 instructions # 3.03 insn per cycle + 4.867732270 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.087246e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.103223e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.103223e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.895651e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.911901e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.911901e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.035017 sec -INFO: No Floating Point Exceptions have been reported - 5,608,597,608 cycles # 2.752 GHz - 13,825,354,537 instructions # 2.47 insn per cycle - 2.039075619 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11520) (512y: 0) (512z: 0) +TOTAL : 2.083250 sec + 5,645,439,415 cycles # 2.706 GHz + 13,860,388,601 instructions # 2.46 insn per cycle + 2.087459740 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.190120e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.211201e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.211201e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.894010e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.914275e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.914275e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.791765 sec -INFO: No Floating Point Exceptions have been reported - 4,921,067,926 cycles # 2.743 GHz - 12,507,200,724 instructions # 2.54 insn per cycle - 1.798123347 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10439) (512y: 89) (512z: 0) +TOTAL : 1.850375 sec + 5,008,092,310 cycles # 2.702 GHz + 12,556,513,170 instructions # 2.51 insn per cycle + 1.855114099 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.012553e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.024911e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.024911e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.736940e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.749376e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.749376e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.347251 sec -INFO: No Floating Point Exceptions have been reported - 4,147,263,675 cycles # 1.765 GHz - 6,394,266,077 instructions # 1.54 insn per cycle - 2.352573303 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1978) (512y: 101) (512z: 9386) +TOTAL : 2.440997 sec + 4,200,411,405 cycles # 1.718 GHz + 6,424,496,970 instructions # 1.53 insn per cycle + 2.445446290 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.scaling new file mode 100644 index 0000000000..30ffb7f326 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_15:58:57 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +1.872973e+05 1 256 +2.845184e+05 2 256 +3.112851e+05 4 256 +3.602269e+05 8 256 +3.862982e+05 16 256 +3.927910e+05 32 256 +3.975811e+05 64 256 +3.994813e+05 128 256 +3.982764e+05 256 256 +4.044121e+05 512 256 +4.143519e+05 1024 256 +### GPU: scaling test 32 +3.147853e+04 1 32 +5.985873e+04 2 32 +1.086414e+05 4 32 +1.846072e+05 8 32 +2.795140e+05 16 32 +3.171308e+05 32 32 +3.664746e+05 64 32 +3.861934e+05 128 32 +3.935760e+05 256 32 +3.959241e+05 512 32 +3.999573e+05 1024 32 +4.014811e+05 2048 32 +4.043590e+05 4096 32 +4.145995e+05 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.851734e+03 1 256 +1.852841e+03 2 256 +1.858966e+03 4 256 +### CPU: scaling test 32 +1.839862e+03 1 32 +1.843418e+03 2 32 +1.855242e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.376740e+03 1 256 +3.427003e+03 2 256 +3.418754e+03 4 256 +### CPU: scaling test 32 +3.343494e+03 1 32 +3.346688e+03 2 32 +3.350028e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +7.930406e+03 1 256 +7.927403e+03 2 256 +7.830665e+03 4 256 +### CPU: scaling 
test 32 +7.705971e+03 1 32 +7.749828e+03 2 32 +7.499380e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +8.438432e+03 1 256 +8.876320e+03 2 256 +8.867251e+03 4 256 +### CPU: scaling test 32 +8.678830e+03 1 32 +8.575889e+03 2 32 +8.706424e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.649041e+03 1 256 +6.668160e+03 2 256 +6.667655e+03 4 256 +### CPU: scaling test 32 +6.543129e+03 1 32 +6.626562e+03 2 32 +6.609869e+03 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.txt new file mode 100644 index 0000000000..ef3556442f --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.txt @@ -0,0 +1,223 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_15:52:22 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.934631e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.970660e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.973586e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.207545 sec + 4,504,483,186 cycles # 2.857 GHz + 6,247,204,557 instructions # 1.39 insn per cycle + 1.634328522 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626675e-04 +Avg ME (F77/GPU) = 6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.840362e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.841255e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.841255e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.917657 sec + 25,674,151,776 cycles # 2.878 GHz + 78,572,254,617 instructions # 3.06 insn per cycle + 8.921718104 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198141133E-004 +Relative difference = 2.8372990776517314e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.319765e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.322676e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.322676e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.946260 sec + 13,085,012,778 cycles # 2.644 GHz + 39,592,390,137 instructions # 3.03 insn per cycle + 4.950371272 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198141122E-004 +Relative difference = 2.837299079287849e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 7.807824e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.823601e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.823601e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.106755 sec + 5,651,241,480 cycles # 2.678 GHz + 13,863,632,897 instructions # 2.45 insn per cycle + 2.110867653 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.771177e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.791107e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.791107e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.876075 sec + 5,022,531,784 cycles # 2.673 GHz + 12,559,680,227 instructions # 2.50 insn per cycle + 1.880203925 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.686685e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.698350e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.698350e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.459028 sec + 4,208,203,803 cycles # 1.709 GHz + 6,429,086,120 instructions # 1.53 insn per cycle + 2.463275806 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt index bc0987eea5..afbbcacb7a 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,272 +10,216 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:39:37 +DATE: 2025-10-11_16:31:19 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.954093e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.263620e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.263620e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.849435e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.385880e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.385880e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.526732 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,227,837,882 cycles # 2.883 GHz - 3,476,505,124 instructions # 1.56 insn per cycle - 0.832118305 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +TOTAL : 0.489334 sec + 2,114,311,442 cycles # 2.842 GHz + 3,127,238,641 instructions # 1.48 insn per cycle + 0.800689166 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.643761e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.124122e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.124122e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.301805 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 10,501,615,955 cycles # 2.935 GHz - 23,489,948,913 instructions # 2.24 insn per cycle - 3.634545913 seconds time elapsed +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 +Avg ME (F77/GPU) = 6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.879294e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.880182e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.880182e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.851000e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.851887e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.851887e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.737845 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 25,658,913,414 cycles # 2.936 GHz - 78,963,594,343 instructions # 3.08 insn per cycle - 8.742435740 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4842) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.871032 sec + 25,693,998,933 cycles # 2.896 GHz + 78,573,360,631 instructions # 3.06 insn per cycle + 8.875307913 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.518464e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.521735e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.521735e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.388018e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.391044e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.391044e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.671849 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 13,102,544,659 cycles # 2.802 GHz - 39,572,381,519 instructions # 3.02 insn per cycle - 4.676455621 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13192) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.851540 sec + 13,088,956,582 cycles # 2.696 GHz + 39,603,859,010 instructions # 3.03 insn per cycle + 4.856264549 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe 
[ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.057114e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.073561e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.073561e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.795496e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.810972e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.810972e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.046600 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 5,627,314,455 cycles # 2.744 GHz - 13,834,298,777 instructions # 2.46 insn per cycle - 2.051219882 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11520) (512y: 0) (512z: 0) +TOTAL : 2.115018 sec + 5,684,762,872 cycles # 2.683 GHz + 13,871,040,440 instructions # 2.44 insn per cycle + 2.119380961 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.239341e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.261385e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.261385e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.855184e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.876301e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.876301e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.786219 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 4,931,565,389 cycles # 2.756 GHz - 12,515,991,121 instructions # 2.54 insn per cycle - 1.790909503 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10439) (512y: 89) (512z: 0) +TOTAL : 1.862992 sec + 5,028,827,648 cycles # 2.694 GHz + 12,567,491,832 instructions # 2.50 insn per cycle + 1.867563931 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, 
FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.038188e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.051446e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.051446e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.712981e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.724915e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.724915e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.341272 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 4,150,945,217 cycles # 1.770 GHz - 6,403,675,117 instructions # 1.54 insn per cycle - 2.345955468 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1978) (512y: 101) (512z: 9386) +TOTAL : 2.454832 sec + 4,213,905,835 cycles # 1.714 GHz + 6,436,340,551 instructions # 1.53 insn per cycle + 2.459274611 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: 
FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt index be31042fc1..d4d5e2b45e 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:50:33 +DATE: 2025-10-11_16:44:57 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.314159e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.339458e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.341417e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.369462e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.419383e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.422637e+05 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.518126 sec -INFO: No Floating Point Exceptions have been reported - 2,164,802,026 cycles # 2.881 GHz - 3,409,915,390 instructions # 1.58 insn per cycle - 0.811338657 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --common -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.487281 sec + 2,090,605,611 cycles # 2.842 GHz + 3,063,541,899 instructions # 1.47 insn per cycle + 0.797172689 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.134613e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.165487e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.166746e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 3.140406 sec -INFO: No Floating Point Exceptions have been reported - 9,973,053,404 cycles # 2.934 GHz - 20,986,544,572 instructions # 2.10 insn per cycle - 3.455765313 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --common +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 +Avg ME (F77/GPU) = 6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.884135e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.885033e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.885033e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.849332e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.850241e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.850241e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 8.712529 sec -INFO: No Floating Point Exceptions have been reported - 25,691,717,185 cycles # 2.948 GHz - 78,960,325,856 instructions # 3.07 insn per cycle - 8.716734440 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4842) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.876225 sec + 25,662,776,506 cycles # 2.890 GHz + 78,567,147,731 instructions # 3.06 insn per cycle + 8.880187224 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.543458e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.546697e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.546697e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.358067e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.361108e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.361108e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.636367 sec -INFO: No Floating Point Exceptions have been reported - 13,067,183,546 cycles # 2.816 GHz - 39,558,454,763 instructions # 3.03 insn per cycle - 4.640590687 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13192) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.892312 sec + 13,068,286,128 cycles # 2.669 GHz + 39,590,526,259 instructions # 3.03 insn per cycle + 4.896571237 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.084806e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.101064e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.101064e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.827564e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.843333e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.843333e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.036679 sec -INFO: No Floating Point Exceptions have been reported - 5,613,470,524 cycles # 2.752 GHz - 13,823,796,455 instructions # 2.46 insn per cycle - 2.040900437 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11520) (512y: 0) (512z: 0) +TOTAL : 2.103410 sec + 5,668,034,580 cycles # 2.691 GHz + 13,860,472,796 instructions # 2.45 insn per cycle + 2.107462678 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.198723e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.219905e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.219905e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.833416e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.853413e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.853413e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.791160 sec -INFO: No Floating Point Exceptions have been reported - 4,922,288,820 cycles # 2.743 GHz - 12,503,388,745 instructions # 2.54 insn per cycle - 1.795321275 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10439) (512y: 89) (512z: 0) +TOTAL : 1.864637 sec + 5,021,320,374 cycles # 2.689 GHz + 12,554,612,891 instructions # 2.50 insn per cycle + 1.868702414 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.975365e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.987686e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.987686e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.674295e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.686265e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.686265e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.359532 sec -INFO: No Floating Point Exceptions have been reported - 4,155,009,705 cycles # 1.759 GHz - 6,390,945,346 instructions # 1.54 insn per cycle - 2.363732897 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1978) (512y: 101) (512z: 9386) +TOTAL : 2.465332 sec + 4,203,800,820 cycles # 1.703 GHz + 6,422,604,226 instructions # 1.53 insn per cycle + 2.469400350 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt index 437b6b7cbd..2beaf322b6 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:47:41 +DATE: 2025-10-11_16:41:27 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.310053e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.334627e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.336677e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.390277e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.431631e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.434858e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.518612 sec -INFO: No Floating Point Exceptions have been reported - 2,156,837,380 cycles # 2.875 GHz - 3,433,389,555 instructions # 1.59 insn per cycle - 0.811650542 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.485227 sec + 2,088,179,344 cycles # 2.833 GHz + 3,069,782,317 instructions # 1.47 insn per cycle + 0.797220882 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.128944e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.159258e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.160487e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.091523 sec -INFO: No Floating Point Exceptions have been reported - 9,825,563,648 cycles # 2.933 GHz - 22,802,776,931 instructions # 2.32 insn per cycle - 3.405923259 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 +Avg ME (F77/GPU) = 6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.890035e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.890938e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.890938e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.841686e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.842564e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.842564e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.683864 sec -INFO: No Floating Point Exceptions have been reported - 25,635,022,031 cycles # 2.951 GHz - 78,960,809,140 instructions # 3.08 insn per cycle - 8.688143049 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4842) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.911703 sec + 25,672,385,298 cycles # 2.880 GHz + 78,567,422,772 instructions # 3.06 insn per cycle + 8.915910048 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.535619e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.538805e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.538805e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.377610e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.380670e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.380670e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.644682 sec -INFO: No Floating Point Exceptions have been reported - 13,070,212,228 cycles # 2.812 GHz - 39,558,910,913 instructions # 3.03 insn per cycle - 4.648863484 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13192) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.861995 sec + 13,083,483,284 cycles # 2.689 GHz + 39,590,790,279 instructions # 3.03 insn per cycle + 4.866021467 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.974136e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.989764e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.989764e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.782247e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.797307e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.797307e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.062978 sec -INFO: No Floating Point Exceptions have been reported - 5,609,565,523 cycles # 2.715 GHz - 13,823,736,601 instructions # 2.46 insn per cycle - 2.067208066 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11520) (512y: 0) (512z: 0) +TOTAL : 2.113995 sec + 5,648,509,407 cycles # 2.668 GHz + 13,860,950,299 instructions # 2.45 insn per cycle + 2.118130954 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.256862e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.278276e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.278276e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.815640e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.835781e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.835781e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.778135 sec -INFO: No Floating Point Exceptions have been reported - 4,913,104,520 cycles # 2.758 GHz - 12,505,156,898 instructions # 2.55 insn per cycle - 1.782374042 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10439) (512y: 89) (512z: 0) +TOTAL : 1.866689 sec + 5,013,333,127 cycles # 2.681 GHz + 12,556,528,301 instructions # 2.50 insn per cycle + 1.870730508 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.040533e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.053211e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.053211e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.601628e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.612890e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.612890e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.335968 sec -INFO: No Floating Point Exceptions have been reported - 4,137,289,106 cycles # 1.769 GHz - 6,392,511,975 instructions # 1.55 insn per cycle - 2.340416062 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1978) (512y: 101) (512z: 9386) +TOTAL : 2.490563 sec + 4,200,883,402 cycles # 1.685 GHz + 6,425,171,149 instructions # 1.53 insn per cycle + 2.494555434 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_noBlas.txt new file mode 100644 index 0000000000..2815ba1af8 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_noBlas.txt @@ -0,0 +1,223 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasNoBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasNoBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_16:50:33 + +HASBLAS=hasNoBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.400466e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.444219e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.447053e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.504359 sec + 2,085,179,396 cycles # 2.830 GHz + 3,096,904,235 instructions # 1.49 insn per cycle + 0.798389923 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626675e-04 +Avg ME (F77/GPU) = 6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.851668e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.852556e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.852556e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.863632 sec + 25,676,607,785 cycles # 2.896 GHz + 78,566,655,326 instructions # 3.06 insn per cycle + 8.867760313 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198141133E-004 +Relative difference = 2.8372990776517314e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.364733e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.367766e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.367766e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.880672 sec + 13,087,360,743 cycles # 2.680 GHz + 39,590,709,537 instructions # 3.03 insn per cycle + 4.884841575 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198141122E-004 +Relative difference = 2.837299079287849e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 7.891642e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.907720e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.907720e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.084604 sec + 5,646,655,758 cycles # 2.704 GHz + 13,860,514,996 instructions # 2.45 insn per cycle + 2.088799789 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.832886e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.853061e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.853061e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.862981 sec + 5,001,186,272 cycles # 2.680 GHz + 12,556,644,714 instructions # 2.51 insn per cycle + 1.867187074 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.594055e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.605629e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.605629e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.493451 sec + 4,195,828,592 cycles # 1.681 GHz + 6,424,665,239 instructions # 1.53 insn per cycle + 2.497646028 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt index f2b15e4b6f..0158323c78 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,251 +10,213 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:44:55 +DATE: 2025-10-11_16:38:00 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst OMP= WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.041462e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.325366e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.327398e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.928428e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.433382e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.436767e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.520118 sec -INFO: No Floating Point Exceptions have been reported - 2,177,158,293 cycles # 2.891 GHz - 3,464,316,990 instructions # 1.59 insn per cycle - 0.812097316 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.486860 sec + 2,086,798,241 cycles # 2.826 GHz + 3,070,254,605 instructions # 1.47 insn per cycle + 0.797700561 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst OMP= +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.734798e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.174453e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.175668e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.213650 sec -INFO: No Floating Point Exceptions have been reported - 10,150,922,529 cycles # 2.918 GHz - 23,231,659,490 instructions # 2.29 insn per cycle - 3.538737264 seconds time elapsed +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 +Avg ME (F77/GPU) = 6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.885407e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.886309e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.886309e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.846748e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.847641e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] 
(3a) = ( 1.847641e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.705137 sec -INFO: No Floating Point Exceptions have been reported - 25,650,530,800 cycles # 2.946 GHz - 78,960,008,246 instructions # 3.08 insn per cycle - 8.709419634 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4842) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.887132 sec + 25,658,141,408 cycles # 2.886 GHz + 78,568,113,694 instructions # 3.06 insn per cycle + 8.891273835 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.551750e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.554937e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.554937e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.370014e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.373021e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.373021e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 
2.368970e+00 ) GeV^-4 -TOTAL : 4.623453 sec -INFO: No Floating Point Exceptions have been reported - 13,056,946,389 cycles # 2.822 GHz - 39,559,090,760 instructions # 3.03 insn per cycle - 4.627712527 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13192) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.872933 sec + 13,079,305,653 cycles # 2.683 GHz + 39,591,036,555 instructions # 3.03 insn per cycle + 4.877066552 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.090893e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.106933e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.106933e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.876108e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.892295e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.892295e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.033338 sec -INFO: No Floating Point Exceptions have 
been reported - 5,609,780,879 cycles # 2.754 GHz - 13,824,722,765 instructions # 2.46 insn per cycle - 2.037509617 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11520) (512y: 0) (512z: 0) +TOTAL : 2.088702 sec + 5,640,399,522 cycles # 2.696 GHz + 13,860,298,624 instructions # 2.46 insn per cycle + 2.092763612 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.188897e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.209893e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.209893e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.890465e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.910782e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.910782e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.791081 sec -INFO: No Floating Point Exceptions have been reported - 4,916,057,270 cycles # 2.740 GHz - 12,505,186,935 instructions # 
2.54 insn per cycle - 1.795355106 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10439) (512y: 89) (512z: 0) +TOTAL : 1.851027 sec + 4,999,453,261 cycles # 2.696 GHz + 12,556,321,373 instructions # 2.51 insn per cycle + 1.855011471 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.019116e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.031683e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.031683e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.623877e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.635346e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.635346e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.343107 sec -INFO: No Floating Point Exceptions have been reported - 4,136,898,273 cycles # 1.763 GHz - 6,392,336,539 instructions # 1.55 insn per cycle - 2.347534329 seconds time elapsed -=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1978) (512y: 101) (512z: 9386) +TOTAL : 2.482437 sec + 4,198,161,225 cycles # 1.689 GHz + 6,424,537,434 instructions # 1.53 insn per cycle + 2.486588561 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt index 99e413a8a3..f41a7b9938 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:03:14 +DATE: 2025-10-11_15:24:03 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.332738e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.357821e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.359802e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.429377e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.477740e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.480923e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.542209 sec -INFO: No Floating Point Exceptions have been reported - 2,220,139,727 cycles # 2.875 GHz - 3,465,138,857 instructions # 1.56 insn per cycle - 0.835706398 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.500889 sec + 2,161,311,557 cycles # 2.855 GHz + 3,140,076,215 instructions # 1.45 insn per cycle + 0.823418290 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.145716e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.176488e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.177708e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.039240 sec -INFO: No Floating Point Exceptions have been reported - 9,630,090,535 cycles # 2.918 GHz - 21,945,170,652 instructions # 2.28 insn per cycle - 3.356721463 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 +Avg ME (F77/GPU) = 6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.881580e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.882499e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.882499e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.849400e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.850323e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.850323e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.723377 sec -INFO: No Floating Point Exceptions have been reported - 25,611,709,249 cycles # 2.935 GHz - 78,703,444,126 instructions # 3.07 insn per cycle - 8.727502935 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4191) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.874198 sec + 25,611,778,767 cycles # 2.885 GHz + 78,652,591,485 instructions # 3.07 insn per cycle + 8.878147244 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.593581e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.596889e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.596889e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.379484e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.382464e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.382464e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.571814 sec -INFO: No Floating Point Exceptions have been reported - 13,039,592,628 cycles # 2.851 GHz - 39,453,086,877 instructions # 3.03 insn per cycle - 4.575893049 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:12966) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.859162 sec + 13,089,109,626 cycles # 2.692 GHz + 39,515,404,087 instructions # 3.02 insn per cycle + 4.863216879 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13022) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.986878e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.003760e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.003760e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.837369e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.853285e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.853285e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.061484 sec -INFO: No Floating Point Exceptions have been reported - 5,673,128,561 cycles # 2.749 GHz - 13,911,820,426 instructions # 2.45 insn per cycle - 2.066505881 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11582) (512y: 0) (512z: 0) +TOTAL : 2.098643 sec + 5,677,190,930 cycles # 2.701 GHz + 13,961,575,914 instructions # 2.46 insn per cycle + 2.102810449 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11630) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.098916e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.119150e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.119150e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.705091e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.724821e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.724821e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.809563 sec -INFO: No Floating Point Exceptions have been reported - 4,990,015,585 cycles # 2.753 GHz - 12,604,471,256 instructions # 2.53 insn per cycle - 1.813650628 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10423) (512y: 241) (512z: 0) +TOTAL : 1.889961 sec + 5,055,738,073 cycles # 2.670 GHz + 12,659,664,704 instructions # 2.50 insn per cycle + 1.894052230 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10483) (512y: 226) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.910207e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.922434e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.922434e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.677757e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.689492e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.689492e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.380650 sec -INFO: No Floating Point Exceptions have been reported - 4,192,440,259 cycles # 1.759 GHz - 6,502,191,985 instructions # 1.55 insn per cycle - 2.384674618 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1754) (512y: 193) (512z: 9382) +TOTAL : 2.462163 sec + 4,206,188,103 cycles # 1.706 GHz + 6,542,388,485 instructions # 1.56 insn per cycle + 2.466313710 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1764) (512y: 185) (512z: 9379) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt index 76362e2777..b05fc67f3a 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:30:00 +DATE: 2025-10-11_16:20:09 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.108959e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.129301e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.130870e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.059658e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.097347e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.099827e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.545749 sec -INFO: No Floating Point Exceptions have been reported - 2,205,865,001 cycles # 2.840 GHz - 3,412,138,367 instructions # 1.55 insn per cycle - 0.835130533 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.501512 sec + 2,120,097,032 cycles # 2.815 GHz + 3,067,817,522 instructions # 1.45 insn per cycle + 0.823770320 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.747537e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.771352e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.772362e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.317305 sec -INFO: No Floating Point Exceptions have been reported - 10,470,225,400 cycles # 2.928 GHz - 22,893,642,046 instructions # 2.19 insn per cycle - 3.632348979 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 Avg ME (F77/GPU) = 6.6266731198158122E-004 Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 4.279433e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.279917e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.279917e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.202543e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.203008e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.203008e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 38.330200 sec -INFO: No Floating Point Exceptions have been reported - 112,786,835,820 cycles # 2.943 GHz - 144,812,254,859 instructions # 1.28 insn per cycle - 38.334547107 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:21273) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 39.031219 sec + 112,588,276,317 cycles # 2.885 GHz + 142,621,877,493 instructions # 1.27 insn per cycle + 39.035229334 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20355) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating 
Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198140461E-004 Relative difference = 2.8372991790910424e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.132336e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.134792e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.134792e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.909352e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.911559e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.911559e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.242571 sec -INFO: No Floating Point Exceptions have been reported - 14,761,048,074 cycles # 2.814 GHz - 37,609,615,991 instructions # 2.55 insn per cycle - 5.246531710 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:68172) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.643908 sec + 15,024,056,162 cycles # 2.661 GHz + 37,385,323,408 instructions # 2.49 insn per cycle + 5.648271623 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:67523) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141209E-004 Relative difference = 2.8372990661989057e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.367426e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.381363e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.381363e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.457222e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.471736e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.471736e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.233268 sec -INFO: No Floating Point Exceptions have been reported - 6,121,196,467 cycles # 2.737 GHz - 13,054,881,187 instructions # 2.13 insn per cycle - 2.237420808 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:46946) (512y: 0) (512z: 0) +TOTAL : 2.205981 sec + 5,946,476,110 cycles # 2.692 GHz + 12,809,216,170 instructions # 2.15 insn per cycle + 2.210041352 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:45792) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156789E-004 Relative difference = 2.837296715097453e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.964974e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.985321e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.985321e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.156302e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.178569e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.178569e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.836637 sec -INFO: No Floating Point Exceptions have been reported - 5,064,709,437 cycles # 2.753 GHz - 11,452,008,336 instructions # 2.26 insn per cycle - 1.840705951 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:40486) (512y: 285) (512z: 0) +TOTAL : 1.797567 sec + 4,817,758,417 cycles # 2.675 GHz + 11,422,908,794 instructions # 2.37 insn per cycle + 1.801731550 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:40102) (512y: 282) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156789E-004 Relative difference = 2.837296715097453e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.358991e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.372760e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.372760e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.936851e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.949204e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.949204e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.235964 sec -INFO: No Floating Point Exceptions have been reported - 3,956,538,826 cycles # 1.767 GHz - 5,928,749,634 instructions # 1.50 insn per cycle - 2.240037452 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2444) (512y: 337) (512z:39338) +TOTAL : 2.370929 sec + 4,028,743,609 cycles # 1.697 GHz + 5,966,081,307 instructions # 1.48 insn per cycle + 2.375198937 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2453) (512y: 337) (512z:39235) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156789E-004 Relative difference = 2.837296715097453e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt index 5040f4b335..10c6792da9 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:31:09 +DATE: 2025-10-11_16:21:27 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.107076e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.130192e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.131670e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.079972e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.118608e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.121448e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.539226 sec -INFO: No Floating Point Exceptions have been reported - 2,240,615,938 cycles # 2.902 GHz - 3,467,491,001 instructions # 1.55 insn per cycle - 0.828466018 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.505348 sec + 2,147,536,542 cycles # 2.834 GHz + 3,073,502,942 instructions # 1.43 insn per cycle + 0.816880103 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.751881e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.775679e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.776668e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.303070 sec -INFO: No Floating Point Exceptions have been reported - 10,434,569,638 cycles # 2.930 GHz - 24,118,235,140 instructions # 2.31 insn per cycle - 3.617886016 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 Avg ME (F77/GPU) = 6.6266731198158122E-004 Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 4.241409e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.241886e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.241886e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.177605e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.178066e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.178066e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 38.674103 sec -INFO: No Floating Point Exceptions have been reported - 113,958,477,984 cycles # 2.947 GHz - 144,286,195,418 instructions # 1.27 insn per cycle - 38.678088373 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:21024) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 39.263371 sec + 113,104,353,359 cycles # 2.881 GHz + 142,499,000,297 instructions # 1.26 insn per cycle + 39.267518963 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20686) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating 
Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198140450E-004 -Relative difference = 2.83729918072716e-07 +Avg ME (F77/C++) = 6.6266731198140461E-004 +Relative difference = 2.8372991790910424e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.007169e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.009483e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.009483e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.978578e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.980900e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.980900e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.460584 sec -INFO: No Floating Point Exceptions have been reported - 15,281,187,875 cycles # 2.797 GHz - 37,839,169,102 instructions # 2.48 insn per cycle - 5.464853538 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:68594) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.512347 sec + 14,738,984,303 cycles # 2.672 GHz + 37,383,415,891 instructions # 2.54 insn per cycle + 5.516366576 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:67498) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point 
Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141209E-004 Relative difference = 2.8372990661989057e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.567317e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.582163e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.582163e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.475575e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.489872e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.489872e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.174218 sec -INFO: No Floating Point Exceptions have been reported - 6,020,206,289 cycles # 2.765 GHz - 12,923,983,464 instructions # 2.15 insn per cycle - 2.178219828 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:46048) (512y: 0) (512z: 0) +TOTAL : 2.200089 sec + 5,900,324,656 cycles # 2.678 GHz + 12,761,113,056 instructions # 2.16 insn per cycle + 2.204163616 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:45170) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156789E-004 Relative difference = 2.837296715097453e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.900478e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.920792e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.920792e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.197126e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.219484e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.219484e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.849478 sec -INFO: No Floating Point Exceptions have been reported - 5,102,330,026 cycles # 2.754 GHz - 11,453,366,172 instructions # 2.24 insn per cycle - 1.853513717 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:40151) (512y: 219) (512z: 0) +TOTAL : 1.789159 sec + 4,800,966,323 cycles # 2.679 GHz + 11,387,516,470 instructions # 2.37 insn per cycle + 1.793280010 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:39634) (512y: 220) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156789E-004 Relative difference = 2.837296715097453e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.368242e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.382314e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.382314e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.918624e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.931258e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.931258e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.232876 sec -INFO: No Floating Point Exceptions have been reported - 3,951,515,189 cycles # 1.767 GHz - 5,896,746,544 instructions # 1.49 insn per cycle - 2.236852257 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1959) (512y: 259) (512z:38977) +TOTAL : 2.376650 sec + 4,022,990,522 cycles # 1.691 GHz + 5,935,742,762 instructions # 1.48 insn per cycle + 2.380804465 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1962) (512y: 259) (512z:38890) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156789E-004 Relative difference = 2.837296715097453e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.scaling new file mode 100644 index 0000000000..66df8ea815 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_15:43:39 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +4.135255e+05 1 256 +5.793061e+05 2 256 +6.367973e+05 4 256 +7.358963e+05 8 256 +7.953962e+05 16 256 +8.026621e+05 32 256 +8.113874e+05 64 256 +8.126232e+05 128 256 +8.151724e+05 256 256 +8.388200e+05 512 256 +8.795025e+05 1024 256 +### GPU: scaling test 32 +5.987397e+04 1 32 +1.082531e+05 2 32 +2.101123e+05 4 32 +2.737883e+05 8 32 +5.126747e+05 16 32 +6.967787e+05 32 32 +7.376223e+05 64 32 +7.871564e+05 128 32 +8.121480e+05 256 32 +8.130411e+05 512 32 +8.134619e+05 1024 32 +8.204307e+05 2048 32 +8.423180e+05 4096 32 +8.883516e+05 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.920624e+03 1 256 +1.925794e+03 2 256 +1.919663e+03 4 256 +### CPU: scaling test 32 +1.889651e+03 1 32 +1.920077e+03 2 32 +1.912129e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.748798e+03 1 256 +6.810960e+03 2 256 +6.802786e+03 4 256 +### CPU: scaling test 32 +6.554707e+03 1 32 +6.688739e+03 2 32 +6.725225e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.524095e+04 1 256 +1.526644e+04 2 256 +1.569761e+04 4 256 +### CPU: scaling test 32 +1.566123e+04 1 32 +1.560506e+04 2 32 +1.523576e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.747918e+04 1 
256 +1.758742e+04 2 256 +1.773825e+04 4 256 +### CPU: scaling test 32 +1.691546e+04 1 32 +1.701187e+04 2 32 +1.740175e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.350824e+04 1 256 +1.356994e+04 2 256 +1.370361e+04 4 256 +### CPU: scaling test 32 +1.321355e+04 1 32 +1.322154e+04 2 32 +1.321729e+04 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index c4676334b0..edf11bdd4c 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:04:57 +DATE: 2025-10-11_15:26:12 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.476973e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.519601e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.523500e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.969754e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.061645e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.069860e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.498075 sec -INFO: No Floating Point Exceptions have been reported - 2,049,620,143 cycles # 2.856 GHz - 3,058,097,989 instructions # 1.49 insn per cycle - 0.977244524 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.480574 sec + 2,060,773,811 cycles # 2.817 GHz + 2,941,122,949 instructions # 1.43 insn per cycle + 0.791153613 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.124860e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.187008e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.189727e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.797790 sec -INFO: No Floating Point Exceptions have been reported - 5,916,497,978 cycles # 2.910 GHz - 12,115,730,956 instructions # 2.05 insn per cycle - 2.090370837 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262667672387088E-004 -Relative difference = 2.825534762507892e-05 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626455e-04 +Avg ME (F77/GPU) = 6.6262665411373489E-004 +Relative difference = 2.8440374627264284e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.932981e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.933931e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.933931e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.903278e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.904203e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.904203e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.490769 sec -INFO: No Floating Point Exceptions have been reported - 24,922,868,630 cycles # 2.935 GHz - 79,110,265,707 instructions # 3.17 insn per cycle - 8.496015758 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3572) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.622014 sec + 25,008,733,138 cycles # 2.900 GHz + 79,110,262,561 instructions # 3.16 insn per cycle + 8.625952005 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863312764526E-004 -Relative difference = 4.998523613136231e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274865450727943E-004 +Relative difference = 6.864248936772735e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.975543e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.988298e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.988298e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.866781e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.879439e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.879439e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.356100 sec -INFO: No Floating Point Exceptions have been reported - 6,536,263,436 cycles # 2.771 GHz - 20,271,266,485 instructions # 3.10 insn per cycle - 2.362378155 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13779) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.393369 sec + 6,521,051,461 cycles # 2.721 GHz + 20,285,887,455 instructions # 3.11 insn per cycle + 2.397558323 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274861442972011E-004 Relative difference = 2.1772539563413118e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.588631e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.595153e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.595153e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.574802e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.581515e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.581515e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.038490 sec -INFO: No Floating Point Exceptions have been reported - 2,837,721,779 cycles # 2.726 GHz - 7,066,858,765 instructions # 2.49 insn per cycle - 1.044464831 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12055) (512y: 0) (512z: 0) +TOTAL : 1.046468 sec + 2,851,964,901 cycles # 2.717 GHz + 7,084,391,235 instructions # 2.48 insn per cycle + 1.050530428 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174396888E-004 Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.762421e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.770702e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.770702e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.745784e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.753552e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.753552e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.936394 sec -INFO: No Floating Point Exceptions have been reported - 2,577,125,275 cycles # 2.745 GHz - 6,404,206,024 instructions # 2.49 insn per cycle - 0.941322355 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11019) (512y: 44) (512z: 0) +TOTAL : 0.944326 sec + 2,540,352,407 cycles # 2.681 GHz + 6,429,340,698 instructions # 2.53 insn per cycle + 0.948183906 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174396888E-004 Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.409980e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.415034e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.415034e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.337094e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.341815e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.341815e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.170914 sec -INFO: No Floating Point Exceptions have been reported - 2,069,436,546 cycles # 1.766 GHz - 3,304,699,013 instructions # 1.60 insn per cycle - 1.174781391 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2603) (512y: 44) (512z: 9605) +TOTAL : 1.231615 sec + 2,100,593,891 cycles # 1.701 GHz + 3,321,026,364 instructions # 1.58 insn per cycle + 1.235667181 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 Avg ME (F77/C++) = 6.6271952779718007E-004 Relative difference = 4.194411063934945e-08 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.scaling new file mode 100644 index 0000000000..ef0c8bca55 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_16:00:32 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +2.335389e+05 1 256 +3.586592e+05 2 256 +4.818891e+05 4 256 +5.593817e+05 8 256 +6.056925e+05 16 256 +6.276955e+05 32 256 +6.367619e+05 64 256 +6.473110e+05 128 256 +6.476010e+05 256 256 +6.505009e+05 512 256 +6.687069e+05 1024 256 +### GPU: scaling test 32 +3.216908e+04 1 32 +6.168033e+04 2 32 +1.180476e+05 4 32 +1.918642e+05 8 32 +3.068465e+05 16 32 +4.811781e+05 32 32 +5.662467e+05 64 32 +6.060356e+05 128 32 +6.424836e+05 256 32 +6.336577e+05 512 32 +6.477611e+05 1024 32 +6.516195e+05 2048 32 +6.509793e+05 4096 32 +6.718523e+05 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.906133e+03 1 256 +1.895289e+03 2 256 +1.894897e+03 4 256 +### CPU: scaling test 32 +1.889460e+03 1 32 +1.885630e+03 2 32 +1.887908e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.645424e+03 1 256 +6.741425e+03 2 256 +6.801857e+03 4 256 +### CPU: scaling test 32 +6.523685e+03 1 32 +6.609563e+03 2 32 +6.739293e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.544354e+04 1 256 +1.568938e+04 2 256 +1.565635e+04 4 256 +### CPU: scaling 
test 32 +1.473739e+04 1 32 +1.556619e+04 2 32 +1.562139e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.746432e+04 1 256 +1.767402e+04 2 256 +1.746961e+04 4 256 +### CPU: scaling test 32 +1.748124e+04 1 32 +1.594924e+04 2 32 +1.708084e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.329941e+04 1 256 +1.349011e+04 2 256 +1.344081e+04 4 256 +### CPU: scaling test 32 +1.333268e+04 1 32 +1.314999e+04 2 32 +1.325747e+04 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.txt new file mode 100644 index 0000000000..701efdbc30 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.txt @@ -0,0 +1,223 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_15:54:02 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 6.311490e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.371404e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.377432e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 1.171779 sec + 4,342,560,419 cycles # 2.834 GHz + 5,966,664,550 instructions # 1.37 insn per cycle + 1.591397840 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626455e-04 +Avg ME (F77/GPU) = 6.6262664623572415E-004 +Relative difference = 2.8452263353202596e-05 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.892352e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.893287e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.893287e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 8.671691 sec + 25,006,063,904 cycles # 2.883 GHz + 79,110,972,034 instructions # 3.16 insn per cycle + 8.675650420 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274865450727943E-004 +Relative difference = 6.864248936772735e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.783736e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.796482e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.796482e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.422556 sec + 6,525,728,187 cycles # 2.691 GHz + 20,285,987,046 instructions # 3.11 insn per cycle + 2.426471276 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274861442972011E-004 +Relative difference = 2.1772539563413118e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.560871e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.567340e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.567340e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 1.055589 sec + 2,850,961,292 cycles # 2.692 GHz + 7,084,449,005 instructions # 2.48 insn per cycle + 1.059632714 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.733304e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.741477e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.741477e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.951122 sec + 2,540,771,004 cycles # 2.663 GHz + 6,429,427,589 instructions # 2.53 insn per cycle + 0.954962814 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.328792e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.333460e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.333460e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.239447 sec + 2,103,191,835 cycles # 1.693 GHz + 3,321,146,945 instructions # 1.58 insn per cycle + 1.243442238 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271952779718007E-004 +Relative difference = 4.194411063934945e-08 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt index dec260c3af..33e9172b7c 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,272 +10,216 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:40:11 +DATE: 2025-10-11_16:32:02 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.924368e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.456718e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.456718e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.481369 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,011,468,293 cycles # 2.883 GHz - 2,972,689,221 instructions # 1.48 insn per cycle - 0.755097926 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +EvtsPerSec[Rmb+ME] (23) = ( 6.861766e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.949922e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.949922e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.048177e+00 +- 2.364571e+00 ) GeV^-4 +TOTAL : 0.468518 sec + 2,012,803,026 cycles # 2.822 GHz + 2,875,965,208 instructions # 1.43 insn per cycle + 0.770453877 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.978465e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.128974e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.128974e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.641709e+00 +- 4.994248e+00 ) GeV^-4 -TOTAL : 1.967107 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 6,502,759,539 cycles # 2.928 GHz - 13,854,302,325 instructions # 2.13 insn per cycle - 2.276466534 seconds time elapsed +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262667672387088E-004 -Relative difference = 2.825534762507892e-05 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626455e-04 +Avg ME (F77/GPU) = 6.6262665411373489E-004 +Relative difference = 2.8440374627264284e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.944212e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.945160e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.945160e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.893203e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.894136e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.894136e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.443358 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 24,934,407,175 cycles # 2.952 GHz - 79,115,502,595 instructions # 3.17 insn per cycle - 8.447759712 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3572) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.670365 sec + 25,029,663,251 cycles # 2.886 GHz + 79,116,596,499 instructions # 3.16 insn per cycle + 8.674407204 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863312764526E-004 -Relative difference = 4.998523613136231e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274865450727943E-004 +Relative difference = 6.864248936772735e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.020230e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.033459e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.033459e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.709216e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.721522e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.721522e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.344217 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 6,541,090,448 cycles # 2.786 GHz - 20,280,124,954 instructions # 3.10 insn per cycle - 2.348689069 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13779) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.452506 sec + 6,536,185,486 cycles # 2.662 GHz + 20,295,453,995 instructions # 3.11 insn per cycle + 2.456555328 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274861442972011E-004 Relative difference = 2.1772539563413118e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.604920e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.611581e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.611581e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.562296e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.568810e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.568810e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.029784 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,846,767,262 cycles # 2.755 GHz - 7,076,446,064 instructions # 2.49 insn per cycle - 1.034215836 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12055) (512y: 0) (512z: 0) +TOTAL : 1.057576 sec + 2,861,881,138 cycles # 2.697 GHz + 7,094,482,774 instructions # 2.48 insn per cycle + 1.061902735 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, 
FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174396888E-004 Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.797566e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.806224e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.806224e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.759096e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.767108e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.767108e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.920078 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,539,792,408 cycles # 2.749 GHz - 6,413,266,409 instructions # 2.53 insn per cycle - 0.924434981 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11019) (512y: 44) (512z: 0) +TOTAL : 0.940293 sec + 2,550,431,948 cycles # 2.703 GHz + 6,439,393,273 instructions # 2.52 insn per cycle + 0.944425361 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: 
FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174396888E-004 Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.411104e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.416189e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.416189e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.351978e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.356813e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.356813e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.170311 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,078,956,436 cycles # 1.771 GHz - 3,314,205,136 instructions # 1.59 insn per cycle - 1.174679954 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2603) (512y: 44) (512z: 9605) +TOTAL : 1.220874 sec + 2,108,458,958 cycles # 1.722 GHz + 3,331,332,180 instructions # 1.58 insn per cycle + 1.225108686 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 Avg ME (F77/C++) = 6.6271952779718007E-004 Relative difference = 4.194411063934945e-08 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt index 3ebd5caeb8..2a484de798 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:51:07 +DATE: 2025-10-11_16:45:41 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.481675e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.521755e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.525865e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.975551e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.068315e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.076540e+05 ) sec^-1 MeanMatrixElemValue = ( 4.159396e-01 +- 3.238803e-01 ) GeV^-4 -TOTAL : 0.477918 sec -INFO: No Floating Point Exceptions have been reported - 1,990,228,523 cycles # 2.864 GHz - 2,978,927,673 instructions # 1.50 insn per cycle - 0.751663902 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --common -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.467991 sec + 2,005,858,911 cycles # 2.818 GHz + 2,853,662,043 instructions # 1.42 insn per cycle + 0.770358119 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.037728e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.099183e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.101846e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.094367e+02 +- 1.071509e+02 ) GeV^-4 -TOTAL : 1.886731 sec -INFO: No Floating Point Exceptions have been reported - 6,136,710,401 cycles # 2.909 GHz - 13,142,850,218 instructions # 2.14 insn per cycle - 2.175693489 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --common +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262667672387088E-004 -Relative difference = 2.825534762507892e-05 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626455e-04 +Avg ME (F77/GPU) = 6.6262665411373489E-004 +Relative difference = 2.8440374627264284e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.941292e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.942240e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.942240e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.892862e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.893799e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.893799e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 8.455097 sec -INFO: No Floating Point Exceptions have been reported - 24,914,950,228 cycles # 2.946 GHz - 79,111,045,664 instructions # 3.18 insn per cycle - 8.459383915 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3572) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.670204 sec + 25,024,619,872 cycles # 2.885 GHz + 79,109,507,524 instructions # 3.16 insn per cycle + 8.674082417 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863312764526E-004 -Relative difference = 4.998523613136231e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274865450727943E-004 +Relative difference = 6.864248936772735e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.977213e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.990041e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.990041e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.794380e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.806787e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.806787e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 -TOTAL : 2.356205 sec -INFO: No Floating Point Exceptions have been reported - 6,550,546,250 cycles # 2.776 GHz - 20,269,237,886 instructions # 3.09 insn per cycle - 2.360272003 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13779) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.419819 sec + 6,522,870,130 cycles # 2.692 GHz + 20,284,313,479 instructions # 3.11 insn per cycle + 2.423616462 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274861442972011E-004 Relative difference = 2.1772539563413118e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.601317e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.608084e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.608084e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.559254e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.565757e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.565757e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 1.030095 sec -INFO: No Floating Point Exceptions have been reported - 2,839,431,727 cycles # 2.748 GHz - 7,063,774,184 instructions # 2.49 insn per cycle - 1.034210988 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12055) (512y: 0) (512z: 0) +TOTAL : 1.057643 sec + 2,858,106,356 cycles # 2.694 GHz + 7,082,027,901 instructions # 2.48 insn per cycle + 1.061594009 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174396888E-004 Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.801735e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.810193e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.810193e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.732036e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.739945e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.739945e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.916264 sec -INFO: No Floating Point Exceptions have been reported - 2,529,614,240 cycles # 2.751 GHz - 6,399,972,746 instructions # 2.53 insn per cycle - 0.920311559 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11019) (512y: 44) (512z: 0) +TOTAL : 0.953431 sec + 2,543,753,776 cycles # 2.660 GHz + 6,427,635,361 instructions # 2.53 insn per cycle + 0.957126756 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174396888E-004 Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.413582e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.418711e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.418711e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.349101e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.354028e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.354028e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 1.166574 sec -INFO: No Floating Point Exceptions have been reported - 2,070,023,042 cycles # 1.769 GHz - 3,300,470,940 instructions # 1.59 insn per cycle - 1.170621524 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2603) (512y: 44) (512z: 9605) +TOTAL : 1.221899 sec + 2,101,668,726 cycles # 1.716 GHz + 3,317,393,025 instructions # 1.58 insn per cycle + 1.225868499 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 Avg ME (F77/C++) = 6.6271952779718007E-004 Relative difference = 4.194411063934945e-08 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt index 8aa78a916d..9f5f8217b1 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:48:16 +DATE: 2025-10-11_16:42:10 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.460370e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.501314e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.505347e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.971986e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.070136e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.083717e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.475676 sec -INFO: No Floating Point Exceptions have been reported - 1,998,344,168 cycles # 2.886 GHz - 3,027,104,836 instructions # 1.51 insn per cycle - 0.748859673 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.465911 sec + 2,085,649,672 cycles # 2.824 GHz + 2,853,158,366 instructions # 1.37 insn per cycle + 0.797926486 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.172168e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.234506e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.237328e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.821851 sec -INFO: No Floating Point Exceptions have been reported - 6,001,499,639 cycles # 2.924 GHz - 13,042,334,044 instructions # 2.17 insn per cycle - 2.109220847 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262667672387088E-004 -Relative difference = 2.825534762507892e-05 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626455e-04 +Avg ME (F77/GPU) = 6.6262665411373489E-004 +Relative difference = 2.8440374627264284e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.941510e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.942442e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.942442e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.887385e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.888309e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.888309e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.452200 sec -INFO: No Floating Point Exceptions have been reported - 24,907,540,526 cycles # 2.946 GHz - 79,109,866,227 instructions # 3.18 insn per cycle - 8.456266423 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3572) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.694438 sec + 25,009,094,589 cycles # 2.876 GHz + 79,110,682,076 instructions # 3.16 insn per cycle + 8.698358258 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863312764526E-004 -Relative difference = 4.998523613136231e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274865450727943E-004 +Relative difference = 6.864248936772735e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.017369e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.030395e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.030395e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.786091e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.798676e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.798676e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.341887 sec -INFO: No Floating Point Exceptions have been reported - 6,533,658,672 cycles # 2.786 GHz - 20,270,788,705 instructions # 3.10 insn per cycle - 2.345994128 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13779) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.421571 sec + 6,521,561,343 cycles # 2.690 GHz + 20,285,907,872 instructions # 3.11 insn per cycle + 2.425622228 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274861442972011E-004 Relative difference = 2.1772539563413118e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.604029e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.610893e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.610893e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.544765e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.551053e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.551053e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.027451 sec -INFO: No Floating Point Exceptions have been reported - 2,836,206,155 cycles # 2.751 GHz - 7,065,988,768 instructions # 2.49 insn per cycle - 1.031531216 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12055) (512y: 0) (512z: 0) +TOTAL : 1.066479 sec + 2,853,976,312 cycles # 2.668 GHz + 7,084,427,661 instructions # 2.48 insn per cycle + 1.070436318 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174396888E-004 Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.796598e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.804847e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.804847e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.733440e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.741292e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.741292e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.917896 sec -INFO: No Floating Point Exceptions have been reported - 2,527,698,465 cycles # 2.744 GHz - 6,403,574,368 instructions # 2.53 insn per cycle - 0.921906155 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11019) (512y: 44) (512z: 0) +TOTAL : 0.951193 sec + 2,545,293,522 cycles # 2.667 GHz + 6,429,326,530 instructions # 2.53 insn per cycle + 0.955037744 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174396888E-004 Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.414079e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.419125e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.419125e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.345267e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.349883e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.349883e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.164994 sec -INFO: No Floating Point Exceptions have been reported - 2,068,678,617 cycles # 1.770 GHz - 3,304,093,166 instructions # 1.60 insn per cycle - 1.169236265 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2603) (512y: 44) (512z: 9605) +TOTAL : 1.224208 sec + 2,101,816,780 cycles # 1.713 GHz + 3,321,301,841 instructions # 1.58 insn per cycle + 1.228087953 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 Avg ME (F77/C++) = 6.6271952779718007E-004 Relative difference = 4.194411063934945e-08 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_noBlas.txt new file mode 100644 index 0000000000..30c823393b --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_noBlas.txt @@ -0,0 +1,223 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasNoBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasNoBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_16:51:59 + +HASBLAS=hasNoBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 8.013258e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.103080e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.110808e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.479902 sec + 1,978,219,521 cycles # 2.831 GHz + 2,863,905,705 instructions # 1.45 insn per cycle + 0.755864012 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626455e-04 +Avg ME (F77/GPU) = 6.6262665411373489E-004 +Relative difference = 2.8440374627264284e-05 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.898659e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.899570e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.899570e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 8.643023 sec + 24,998,550,241 cycles # 2.892 GHz + 79,111,084,095 instructions # 3.16 insn per cycle + 8.646984489 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274865450727943E-004 +Relative difference = 6.864248936772735e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.719385e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.731327e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.731327e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.445830 sec + 6,526,769,240 cycles # 2.665 GHz + 20,286,103,115 instructions # 3.11 insn per cycle + 2.449754025 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274861442972011E-004 +Relative difference = 2.1772539563413118e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.565963e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.572237e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.572237e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 1.052461 sec + 2,851,588,130 cycles # 2.701 GHz + 7,084,479,012 instructions # 2.48 insn per cycle + 1.056444800 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.748496e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.756542e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.756542e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.942761 sec + 2,539,647,091 cycles # 2.684 GHz + 6,429,491,013 instructions # 2.53 insn per cycle + 0.946755867 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.348567e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.353355e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.353355e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.221456 sec + 2,102,747,652 cycles # 1.717 GHz + 3,321,271,092 instructions # 1.58 insn per cycle + 1.225405100 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271952779718007E-004 +Relative difference = 4.194411063934945e-08 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt index 59696ff16e..b51802abeb 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,251 +10,213 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:45:30 +DATE: 2025-10-11_16:38:43 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst OMP= WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.026958e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.513975e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.517845e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.478506 sec -INFO: No Floating Point Exceptions have been reported - 1,992,355,788 cycles # 2.865 GHz - 3,027,729,409 instructions # 1.52 insn per cycle - 0.751914958 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.083410e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.111715e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.119810e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.048177e+00 +- 2.364571e+00 ) GeV^-4 +TOTAL : 0.467709 sec + 2,010,523,047 cycles # 2.824 GHz + 2,892,361,831 instructions # 1.44 insn per cycle + 0.770628946 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst OMP= +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.156008e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.226322e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.229025e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.641709e+00 +- 4.994248e+00 ) GeV^-4 -TOTAL : 1.900625 sec -INFO: No Floating Point Exceptions have been reported - 6,225,372,770 cycles # 2.919 GHz - 12,616,761,411 instructions # 2.03 insn per cycle - 2.188103626 seconds time elapsed +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262667672387088E-004 -Relative difference = 2.825534762507892e-05 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626455e-04 +Avg ME (F77/GPU) = 6.6262665411373489E-004 +Relative difference = 2.8440374627264284e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.942577e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.943527e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.943527e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.889714e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.890621e+03 ) 
sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.890621e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.447888 sec -INFO: No Floating Point Exceptions have been reported - 24,912,816,300 cycles # 2.948 GHz - 79,110,249,403 instructions # 3.18 insn per cycle - 8.452014602 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3572) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.683941 sec + 25,012,693,300 cycles # 2.880 GHz + 79,111,053,402 instructions # 3.16 insn per cycle + 8.687777898 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863312764526E-004 -Relative difference = 4.998523613136231e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274865450727943E-004 +Relative difference = 6.864248936772735e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.980733e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.993141e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.993141e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.774197e+03 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 6.786532e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.786532e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.354354 sec -INFO: No Floating Point Exceptions have been reported - 6,535,460,807 cycles # 2.772 GHz - 20,270,869,690 instructions # 3.10 insn per cycle - 2.358646539 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13779) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.425829 sec + 6,538,669,629 cycles # 2.692 GHz + 20,286,236,268 instructions # 3.10 insn per cycle + 2.429903422 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274861442972011E-004 Relative difference = 2.1772539563413118e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.603543e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.610156e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.610156e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.538774e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.544893e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = 
( 1.544893e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.027888 sec -INFO: No Floating Point Exceptions have been reported - 2,837,672,612 cycles # 2.752 GHz - 7,066,358,168 instructions # 2.49 insn per cycle - 1.031930682 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12055) (512y: 0) (512z: 0) +TOTAL : 1.071044 sec + 2,851,268,280 cycles # 2.654 GHz + 7,084,649,438 instructions # 2.48 insn per cycle + 1.074854505 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174396888E-004 Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.798975e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.807399e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.807399e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.734960e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.742729e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.742729e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 
-TOTAL : 0.916670 sec -INFO: No Floating Point Exceptions have been reported - 2,525,901,356 cycles # 2.745 GHz - 6,403,453,175 instructions # 2.54 insn per cycle - 0.920789172 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11019) (512y: 44) (512z: 0) +TOTAL : 0.950344 sec + 2,540,286,423 cycles # 2.664 GHz + 6,429,424,927 instructions # 2.53 insn per cycle + 0.954335905 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174396888E-004 Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.406582e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.411589e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.411589e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.326881e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.331538e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.331538e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.171278 sec -INFO: No Floating Point Exceptions have been reported - 
2,071,908,739 cycles # 1.764 GHz - 3,303,987,486 instructions # 1.59 insn per cycle - 1.175442581 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2603) (512y: 44) (512z: 9605) +TOTAL : 1.241226 sec + 2,102,177,412 cycles # 1.689 GHz + 3,321,695,580 instructions # 1.58 insn per cycle + 1.245320786 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 Avg ME (F77/C++) = 6.6271952779718007E-004 Relative difference = 4.194411063934945e-08 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt index fc006f8d57..a1ed0e1048 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:05:24 +DATE: 2025-10-11_15:26:49 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.473150e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.513248e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.516891e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.023167e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.101141e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.108760e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.500476 sec -INFO: No Floating Point Exceptions have been reported - 2,066,687,911 cycles # 2.859 GHz - 3,064,980,702 instructions # 1.48 insn per cycle - 0.941605450 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.481972 sec + 2,053,644,686 cycles # 2.818 GHz + 2,906,367,138 instructions # 1.42 insn per cycle + 0.790666270 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.096999e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.159101e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.161763e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.803372 sec -INFO: No Floating Point Exceptions have been reported - 5,931,019,959 cycles # 2.909 GHz - 12,491,679,666 instructions # 2.11 insn per cycle - 2.096189929 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262667672387088E-004 -Relative difference = 2.825534762507892e-05 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626455e-04 +Avg ME (F77/GPU) = 6.6262665411373489E-004 +Relative difference = 2.8440374627264284e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.927739e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.928675e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.928675e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.911966e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.912904e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.912904e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.512686 sec -INFO: No Floating Point Exceptions have been reported - 24,976,995,918 cycles # 2.933 GHz - 78,849,322,260 instructions # 3.16 insn per cycle - 8.521021644 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3092) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.582602 sec + 24,849,332,204 cycles # 2.895 GHz + 78,811,199,944 instructions # 3.17 insn per cycle + 8.586531797 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2999) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274866250177339E-004 -Relative difference = 5.65798569465384e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274863279149748E-004 +Relative difference = 4.947803358686673e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.196617e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.210064e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.210064e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.802565e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.815087e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.815087e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.283841 sec -INFO: No Floating Point Exceptions have been reported - 6,462,353,077 cycles # 2.825 GHz - 20,230,287,596 instructions # 3.13 insn per cycle - 2.291660153 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13491) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.415633 sec + 6,482,490,857 cycles # 2.680 GHz + 20,247,828,097 instructions # 3.12 insn per cycle + 2.419608944 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13541) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274861448331612E-004 Relative difference = 2.1853408865157068e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.507603e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.513399e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.513399e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.493020e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.499074e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.499074e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.094262 sec -INFO: No Floating Point Exceptions have been reported - 2,977,852,840 cycles # 2.716 GHz - 7,207,139,157 instructions # 2.42 insn per cycle - 1.100869463 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12437) (512y: 0) (512z: 0) +TOTAL : 1.103256 sec + 2,994,004,582 cycles # 2.706 GHz + 7,224,670,986 instructions # 2.41 insn per cycle + 1.107361000 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12455) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271939668088170E-004 Relative difference = 5.008331292535666e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.740158e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.747960e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.747960e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.703839e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.711671e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.711671e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.947565 sec -INFO: No Floating Point Exceptions have been reported - 2,615,044,427 cycles # 2.750 GHz - 6,545,142,442 instructions # 2.50 insn per cycle - 0.954571468 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11449) (512y: 27) (512z: 0) +TOTAL : 0.967356 sec + 2,634,233,834 cycles # 2.714 GHz + 6,565,459,296 instructions # 2.49 insn per cycle + 0.971230309 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11486) (512y: 13) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271939668088170E-004 Relative difference = 5.008331292535666e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.344321e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.349023e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.349023e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.318889e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.323344e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.323344e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.225060 sec -INFO: No Floating Point Exceptions have been reported - 2,140,395,059 cycles # 1.742 GHz - 3,462,158,546 instructions # 1.62 insn per cycle - 1.232075146 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3037) (512y: 25) (512z: 9677) +TOTAL : 1.248532 sec + 2,165,605,341 cycles # 1.730 GHz + 3,476,565,175 instructions # 1.61 insn per cycle + 1.252574898 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3018) (512y: 20) (512z: 9665) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 Avg ME (F77/C++) = 6.6271952032316561E-004 Relative difference = 3.066631594207157e-08 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt index 507fa267fb..c3e94ba26d 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:32:18 +DATE: 2025-10-11_16:22:45 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.570913e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.612300e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.616113e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.059597e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.500062 sec -INFO: No Floating Point Exceptions have been reported - 2,077,093,809 cycles # 2.883 GHz - 3,095,482,027 instructions # 1.49 insn per cycle - 0.782648151 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.980018e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.060840e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.068475e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.483472 sec + 2,078,701,556 cycles # 2.836 GHz + 2,938,258,784 instructions # 1.41 insn per cycle + 0.794272127 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.624378e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.693284e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.696098e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.736663 sec -INFO: No Floating Point Exceptions have been reported - 5,745,039,966 cycles # 2.917 GHz - 12,243,347,327 instructions # 2.13 insn per cycle - 2.029186282 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262669162351490E-004 -Relative difference = 2.8232862531213374e-05 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626455e-04 +Avg ME (F77/GPU) = 6.6262664051428000E-004 +Relative difference = 2.8460897599042618e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 5.610943e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.611718e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.611718e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.536396e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.537181e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.537181e+02 ) sec^-1 MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 29.233986 sec -INFO: No Floating Point Exceptions have been reported - 86,131,386,822 cycles # 2.946 GHz - 135,652,659,903 instructions # 1.57 insn per cycle - 29.237672033 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:15856) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 29.627851 sec + 85,239,542,827 cycles # 2.877 GHz + 134,215,968,109 instructions # 1.57 insn per cycle + 29.631730646 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:15099) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627535e-04 -Avg ME (F77/C++) = 6.6275349717465765E-004 -Relative difference = 4.26303654465793e-09 +Avg ME (F77/C++) = 6.6275349049735310E-004 +Relative difference = 1.4338131648076968e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.849906e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.862163e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.862163e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.562878e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.574411e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.574411e+03 ) sec^-1 MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 -TOTAL : 2.399244 sec -INFO: No Floating Point Exceptions have been reported - 6,757,771,203 cycles # 2.813 GHz - 19,352,943,673 instructions # 2.86 insn per cycle - 2.403059869 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:69577) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.504142 sec + 6,771,535,920 cycles # 2.701 GHz + 19,207,882,725 instructions # 2.84 insn per cycle + 2.508192424 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:68781) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274862748188362E-004 Relative difference = 4.14665283800746e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.430057e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.435326e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.435326e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.450780e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.456226e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.456226e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.151867 sec -INFO: No Floating Point Exceptions have been reported - 3,169,480,733 cycles # 2.744 GHz - 6,794,963,559 instructions # 2.14 insn per cycle - 1.155607574 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:49034) (512y: 0) (512z: 0) +TOTAL : 1.135519 sec + 3,073,910,834 cycles # 2.700 GHz + 6,671,130,394 instructions # 2.17 insn per cycle + 1.139479935 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:47844) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627273e-04 Avg ME (F77/C++) = 6.6272731568543797E-004 Relative difference = 2.3668012430631962e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.731154e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.739005e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.739005e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.771981e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.780020e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.780020e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 0.952402 sec -INFO: No Floating Point Exceptions have been reported - 2,622,407,179 cycles # 2.744 GHz - 5,970,044,618 instructions # 2.28 insn per cycle - 0.956238068 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:42602) (512y: 11) (512z: 0) +TOTAL : 0.930511 sec + 2,525,041,206 cycles # 2.704 GHz + 5,950,807,908 instructions # 2.36 insn per cycle + 0.934389144 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:42169) (512y: 10) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627273e-04 Avg ME (F77/C++) = 6.6272731568543797E-004 Relative difference = 2.3668012430631962e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.414435e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.419474e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.419474e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.326409e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.331048e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.331048e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.165045 sec -INFO: No Floating Point Exceptions have been reported - 2,067,228,248 cycles # 1.769 GHz - 3,495,098,954 instructions # 1.69 insn per cycle - 1.168981438 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5208) (512y: 3) (512z:44858) +TOTAL : 1.241611 sec + 2,116,308,082 cycles # 1.700 GHz + 3,522,579,874 instructions # 1.66 insn per cycle + 1.245792482 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5213) (512y: 3) (512z:44839) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627275e-04 Avg ME (F77/C++) = 6.6272750237027223E-004 Relative difference = 3.5765412974815996e-09 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt index 2595c32afa..0bef615dd8 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:33:09 +DATE: 2025-10-11_16:23:46 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.573938e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.613715e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.617455e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.059597e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.493227 sec -INFO: No Floating Point Exceptions have been reported - 2,049,677,908 cycles # 2.879 GHz - 3,032,655,926 instructions # 1.48 insn per cycle - 0.769218706 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 8.071174e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.149873e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.157266e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.480187 sec + 2,056,422,141 cycles # 2.821 GHz + 2,909,868,255 instructions # 1.42 insn per cycle + 0.789769149 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.673337e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.742674e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.745488e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.731870 sec -INFO: No Floating Point Exceptions have been reported - 5,773,880,906 cycles # 2.919 GHz - 12,286,627,464 instructions # 2.13 insn per cycle - 2.034768323 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262669162351490E-004 -Relative difference = 2.8232862531213374e-05 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626455e-04 +Avg ME (F77/GPU) = 6.6262664051428000E-004 +Relative difference = 2.8460897599042618e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 5.600277e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.601076e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.601076e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.550689e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.551508e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.551508e+02 ) sec^-1 MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 29.289301 sec -INFO: No Floating Point Exceptions have been reported - 86,207,606,672 cycles # 2.943 GHz - 135,355,986,373 instructions # 1.57 insn per cycle - 29.293063672 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:15471) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 29.550873 sec + 85,210,035,482 cycles # 2.883 GHz + 134,053,525,503 instructions # 1.57 insn per cycle + 29.554932127 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:15171) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627535e-04 -Avg ME (F77/C++) = 6.6275349662128086E-004 -Relative difference = 5.098002770919431e-09 +Avg ME (F77/C++) = 6.6275349729240374E-004 +Relative difference = 4.085374577342176e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.848001e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.860244e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.860244e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.704049e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.715826e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.715826e+03 ) sec^-1 MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 -TOTAL : 2.399823 sec -INFO: No Floating Point Exceptions have been reported - 6,855,955,670 cycles # 2.853 GHz - 19,471,788,292 instructions # 2.84 insn per cycle - 2.403723205 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:69876) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.451563 sec + 6,575,110,645 cycles # 2.679 GHz + 19,101,194,250 instructions # 2.91 insn per cycle + 2.455617178 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:68204) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274862799683282E-004 Relative difference = 4.2243518621014775e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.455129e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.460639e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.460639e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.461044e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.466509e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.466509e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.132031 sec -INFO: No Floating Point Exceptions have been reported - 3,102,391,764 cycles # 2.733 GHz - 6,715,014,781 instructions # 2.16 insn per cycle - 1.135898458 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:47692) (512y: 0) (512z: 0) +TOTAL : 1.127472 sec + 3,056,173,108 cycles # 2.702 GHz + 6,654,226,606 instructions # 2.18 insn per cycle + 1.131533762 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:47010) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627273e-04 Avg ME (F77/C++) = 6.6272731623419345E-004 Relative difference = 2.449603850635964e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.738588e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.746518e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.746518e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.769806e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.777757e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.777757e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 0.948137 sec -INFO: No Floating Point Exceptions have been reported - 2,626,199,962 cycles # 2.761 GHz - 5,966,019,567 instructions # 2.27 insn per cycle - 0.951931849 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:41858) (512y: 13) (512z: 0) +TOTAL : 0.931579 sec + 2,522,992,718 cycles # 2.700 GHz + 5,975,076,879 instructions # 2.37 insn per cycle + 0.935429613 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:41660) (512y: 11) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627273e-04 Avg ME (F77/C++) = 6.6272731623419345E-004 Relative difference = 2.449603850635964e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.414552e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.419616e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.419616e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.345570e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.350413e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.350413e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.164736 sec -INFO: No Floating Point Exceptions have been reported - 2,067,746,434 cycles # 1.771 GHz - 3,487,891,958 instructions # 1.69 insn per cycle - 1.168545250 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4171) (512y: 4) (512z:44494) +TOTAL : 1.223621 sec + 2,097,428,008 cycles # 1.710 GHz + 3,514,537,932 instructions # 1.68 insn per cycle + 1.227733047 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4173) (512y: 4) (512z:44470) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627275e-04 Avg ME (F77/C++) = 6.6272750247886592E-004 Relative difference = 3.740400032174438e-09 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.scaling new file mode 100644 index 0000000000..10d80cdca4 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_15:43:12 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +2.858419e+05 1 256 +3.745329e+05 2 256 +3.897177e+05 4 256 +4.239569e+05 8 256 +4.437166e+05 16 256 +4.444009e+05 32 256 +4.485074e+05 64 256 +4.433314e+05 128 256 +4.512938e+05 256 256 +4.568500e+05 512 256 +4.555629e+05 1024 256 +### GPU: scaling test 32 +5.657558e+04 1 32 +1.070333e+05 2 32 +1.849532e+05 4 32 +2.657280e+05 8 32 +3.949685e+05 16 32 +3.946154e+05 32 32 +4.350193e+05 64 32 +4.473966e+05 128 32 +4.519860e+05 256 32 +4.459799e+05 512 32 +4.463425e+05 1024 32 +4.512453e+05 2048 32 +4.596972e+05 4096 32 +4.567015e+05 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.832892e+03 1 256 +1.824058e+03 2 256 +1.836696e+03 4 256 +### CPU: scaling test 32 +1.828347e+03 1 32 +1.832242e+03 2 32 +1.831046e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.486552e+03 1 256 +3.490138e+03 2 256 +3.498447e+03 4 256 +### CPU: scaling test 32 +3.349673e+03 1 32 +3.424966e+03 2 32 +3.419275e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +7.965219e+03 1 256 +7.977523e+03 2 256 +8.081277e+03 4 256 +### CPU: scaling 
test 32 +7.768804e+03 1 32 +7.471564e+03 2 32 +7.954694e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +9.159079e+03 1 256 +9.181848e+03 2 256 +9.256886e+03 4 256 +### CPU: scaling test 32 +8.945974e+03 1 32 +8.898384e+03 2 32 +8.978221e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.830723e+03 1 256 +6.905755e+03 2 256 +6.932432e+03 4 256 +### CPU: scaling test 32 +6.653413e+03 1 32 +6.716747e+03 2 32 +6.760196e+03 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index a3a2deda6e..e3e2b43997 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:03:48 +DATE: 2025-10-11_15:24:46 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.318725e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.347238e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.349358e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.393156e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.441810e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.445057e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.539210 sec -INFO: No Floating Point Exceptions have been reported - 2,220,963,802 cycles # 2.880 GHz - 3,406,426,816 instructions # 1.53 insn per cycle - 0.832307462 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.502434 sec + 2,151,870,507 cycles # 2.842 GHz + 3,130,235,445 instructions # 1.45 insn per cycle + 0.824960007 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.134167e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.164785e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.165985e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.047029 sec -INFO: No Floating Point Exceptions have been reported - 9,687,290,131 cycles # 2.924 GHz - 21,862,744,253 instructions # 2.26 insn per cycle - 3.379254641 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 70 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266732376103494E-004 -Relative difference = 2.659538381540814e-07 +Avg ME (F77/GPU) = 6.6266731567731949E-004 +Relative difference = 2.781525885774229e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.868179e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.869079e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.869079e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.825164e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.826053e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.826053e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.786228 sec -INFO: No Floating Point Exceptions have been reported - 25,910,148,307 cycles # 2.949 GHz - 79,427,985,275 instructions # 3.07 insn per cycle - 8.790193498 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4775) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.992021 sec + 26,029,577,464 cycles # 2.894 GHz + 79,114,128,675 instructions # 3.04 insn per cycle + 8.996124488 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731406016235E-004 Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.521065e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.524381e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.524381e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.429291e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.432449e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.432449e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.666859 sec -INFO: No Floating Point Exceptions have been reported - 12,831,991,791 cycles # 2.749 GHz - 38,825,085,312 instructions # 3.03 insn per cycle - 4.671138327 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13173) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.789072 sec + 12,824,725,318 cycles # 2.676 GHz + 38,757,792,368 instructions # 3.02 insn per cycle + 4.793199776 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13165) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730246908442E-004 Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.087173e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.104021e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.104021e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.935628e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.953025e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.953025e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.035173 sec -INFO: No Floating Point Exceptions have been reported - 5,594,158,972 cycles # 2.744 GHz - 13,617,938,147 instructions # 2.43 insn per cycle - 2.039272194 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11427) (512y: 0) (512z: 0) +TOTAL : 2.072950 sec + 5,562,263,841 cycles # 2.679 GHz + 13,540,518,730 instructions # 2.43 insn per cycle + 2.077092697 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11399) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.329915e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.351715e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.351715e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.986204e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.007643e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.007643e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.764965 sec -INFO: No Floating Point Exceptions have been reported - 4,865,961,098 cycles # 2.752 GHz - 12,296,280,016 instructions # 2.53 insn per cycle - 1.768959352 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10331) (512y: 80) (512z: 0) +TOTAL : 1.831318 sec + 4,854,515,630 cycles # 2.646 GHz + 12,237,415,635 instructions # 2.52 insn per cycle + 1.835524858 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10382) (512y: 45) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.944494e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.956947e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.956947e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.899014e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.911241e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.911241e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.368908 sec -INFO: No Floating Point Exceptions have been reported - 4,175,656,001 cycles # 1.761 GHz - 6,394,856,033 instructions # 1.53 insn per cycle - 2.373043514 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1983) (512y: 92) (512z: 9360) +TOTAL : 2.383753 sec + 4,111,562,734 cycles # 1.722 GHz + 6,282,557,303 instructions # 1.53 insn per cycle + 2.388073448 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1904) (512y: 61) (512z: 9361) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.scaling new file mode 100644 index 0000000000..5eb0658f4e --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_15:59:44 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +1.478169e+05 1 256 +2.269338e+05 2 256 +2.908405e+05 4 256 +3.460040e+05 8 256 +3.706753e+05 16 256 +3.850253e+05 32 256 +3.834285e+05 64 256 +3.887436e+05 128 256 +3.877878e+05 256 256 +3.930166e+05 512 256 +4.044746e+05 1024 256 +### GPU: scaling test 32 +2.315019e+04 1 32 +4.199167e+04 2 32 +8.231040e+04 4 32 +1.430769e+05 8 32 +2.353840e+05 16 32 +2.941154e+05 32 32 +3.501493e+05 64 32 +3.762161e+05 128 32 +3.849858e+05 256 32 +3.843601e+05 512 32 +3.882366e+05 1024 32 +3.853348e+05 2048 32 +3.939954e+05 4096 32 +4.042764e+05 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.820929e+03 1 256 +1.819554e+03 2 256 +1.824693e+03 4 256 +### CPU: scaling test 32 +1.809922e+03 1 32 +1.818380e+03 2 32 +1.829598e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.467484e+03 1 256 +3.477201e+03 2 256 +3.483666e+03 4 256 +### CPU: scaling test 32 +3.376210e+03 1 32 +3.385787e+03 2 32 +3.462870e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +7.773756e+03 1 256 +7.868538e+03 2 256 +7.891583e+03 4 256 +### CPU: scaling 
test 32 +7.767594e+03 1 32 +7.512875e+03 2 32 +7.861406e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +8.905874e+03 1 256 +9.000800e+03 2 256 +9.159354e+03 4 256 +### CPU: scaling test 32 +9.007891e+03 1 32 +8.853559e+03 2 32 +8.999340e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.725095e+03 1 256 +6.926689e+03 2 256 +6.793100e+03 4 256 +### CPU: scaling test 32 +6.759773e+03 1 32 +6.705987e+03 2 32 +6.758642e+03 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.txt new file mode 100644 index 0000000000..8b06b13019 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.txt @@ -0,0 +1,223 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_15:53:12 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.813357e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.847839e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.850325e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.193508 sec + 4,401,135,195 cycles # 2.829 GHz + 6,108,788,422 instructions # 1.39 insn per cycle + 1.613268691 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 70 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626675e-04 +Avg ME (F77/GPU) = 6.6266733778757203E-004 +Relative difference = 2.447870582934832e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.815440e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.816305e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.816305e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 9.040328 sec + 26,031,336,563 cycles # 2.879 GHz + 79,117,154,926 instructions # 3.04 insn per cycle + 9.044442399 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731406016235E-004 +Relative difference = 2.8059296349552523e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.427905e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.431039e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.431039e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.790651 sec + 12,832,687,294 cycles # 2.677 GHz + 38,758,106,395 instructions # 3.02 insn per cycle + 4.794734568 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13165) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730246908442E-004 +Relative difference = 2.98084507782618e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 7.935202e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.951558e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.951558e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.072958 sec + 5,568,085,348 cycles # 2.682 GHz + 13,540,506,751 instructions # 2.43 insn per cycle + 2.076971724 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11399) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 9.161412e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.183655e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.183655e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.796303 sec + 4,854,337,043 cycles # 2.698 GHz + 12,237,142,563 instructions # 2.52 insn per cycle + 1.800481736 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10382) (512y: 45) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.873484e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.885441e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.885441e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.392508 sec + 4,106,170,622 cycles # 1.714 GHz + 6,282,499,145 instructions # 1.53 insn per cycle + 2.396728116 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1904) (512y: 61) (512z: 9361) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_noBlas.txt new file mode 100644 index 0000000000..1a693ccc02 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_noBlas.txt @@ -0,0 +1,223 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasNoBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasNoBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_16:51:16 + +HASBLAS=hasNoBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.425282e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.474579e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.477977e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.505604 sec + 2,079,342,335 cycles # 2.823 GHz + 3,110,113,358 instructions # 1.50 insn per cycle + 0.804143585 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 70 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626675e-04 +Avg ME (F77/GPU) = 6.6266731567731949E-004 +Relative difference = 2.781525885774229e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.820544e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.821419e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.821419e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 9.014922 sec + 26,029,815,792 cycles # 2.887 GHz + 79,113,148,007 instructions # 3.04 insn per cycle + 9.018853711 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731406016235E-004 +Relative difference = 2.8059296349552523e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.422911e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.426145e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.426145e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.797700 sec + 12,826,872,860 cycles # 2.672 GHz + 38,756,601,713 instructions # 3.02 insn per cycle + 4.801871860 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13165) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730246908442E-004 +Relative difference = 2.98084507782618e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 7.944046e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.960023e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.960023e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.070707 sec + 5,566,396,722 cycles # 2.684 GHz + 13,540,340,017 instructions # 2.43 insn per cycle + 2.074804703 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11399) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 9.072103e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.093961e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.093961e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.814093 sec + 4,852,758,403 cycles # 2.670 GHz + 12,237,059,875 instructions # 2.52 insn per cycle + 1.818055824 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10382) (512y: 45) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.846048e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.858465e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.858465e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.401888 sec + 4,113,800,876 cycles # 1.711 GHz + 6,282,877,511 instructions # 1.53 insn per cycle + 2.405935799 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1904) (512y: 61) (512z: 9361) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt index f598011718..55816a282e 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:04:22 +DATE: 2025-10-11_15:25:29 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.335025e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.357927e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.359916e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.409960e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.457193e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.460417e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.537353 sec -INFO: No Floating Point Exceptions have been reported - 2,216,980,042 cycles # 2.869 GHz - 3,463,326,813 instructions # 1.56 insn per cycle - 0.836472238 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.500032 sec + 2,128,939,464 cycles # 2.818 GHz + 3,048,895,103 instructions # 1.43 insn per cycle + 0.815266921 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.141323e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.172030e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.173253e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.034442 sec -INFO: No Floating Point Exceptions have been reported - 9,665,974,027 cycles # 2.922 GHz - 21,248,987,108 instructions # 2.20 insn per cycle - 3.363171619 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 70 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266732376103494E-004 -Relative difference = 2.659538381540814e-07 +Avg ME (F77/GPU) = 6.6266731567731949E-004 +Relative difference = 2.781525885774229e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.862251e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.863154e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.863154e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.835004e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.835894e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.835894e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.813876 sec -INFO: No Floating Point Exceptions have been reported - 25,987,730,158 cycles # 2.948 GHz - 79,453,128,863 instructions # 3.06 insn per cycle - 8.817767368 seconds time elapsed +TOTAL : 8.943891 sec + 25,955,962,699 cycles # 2.901 GHz + 79,198,038,648 instructions # 3.05 insn per cycle + 8.947961266 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest_cpp.exe -INFO: The following 
Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731406016235E-004 Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.512571e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.515785e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.515785e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.464500e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.467677e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.467677e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.675994 sec -INFO: No Floating Point Exceptions have been reported - 12,822,983,844 cycles # 2.741 GHz - 38,780,874,555 instructions # 3.02 insn per cycle - 4.681038643 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:12935) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.740131 sec + 12,742,308,756 cycles # 2.686 GHz + 38,685,964,134 instructions # 3.04 insn per cycle + 4.744223175 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:12933) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW 
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730246908442E-004 Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.056370e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.072927e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.072927e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.985627e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.001632e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.001632e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.042531 sec -INFO: No Floating Point Exceptions have been reported - 5,590,175,615 cycles # 2.733 GHz - 13,732,675,080 instructions # 2.46 insn per cycle - 2.046647326 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11510) (512y: 0) (512z: 0) +TOTAL : 2.059737 sec + 5,594,595,243 cycles # 2.712 GHz + 13,643,577,301 instructions # 2.44 insn per cycle + 2.063806863 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11479) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.148791e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.170046e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.170046e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.864560e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.884766e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.884766e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.800883 sec -INFO: No Floating Point Exceptions have been reported - 4,955,825,709 cycles # 2.749 GHz - 12,423,990,964 instructions # 2.51 insn per cycle - 1.804980058 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10322) (512y: 240) (512z: 0) +TOTAL : 1.855976 sec + 5,031,540,017 cycles # 2.706 GHz + 12,343,462,839 instructions # 2.45 insn per cycle + 1.860103785 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10307) (512y: 226) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.851374e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.863307e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.863307e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.836346e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.848432e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.848432e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.400794 sec -INFO: No Floating Point Exceptions have been reported - 4,218,682,996 cycles # 1.755 GHz - 6,496,899,309 instructions # 1.54 insn per cycle - 2.406253121 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1806) (512y: 190) (512z: 9358) +TOTAL : 2.405420 sec + 4,109,302,173 cycles # 1.706 GHz + 6,383,895,140 instructions # 1.55 insn per cycle + 2.409513085 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1734) (512y: 178) (512z: 9357) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.scaling new file mode 100644 index 0000000000..f43e214106 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.scaling @@ -0,0 +1,118 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +DATE: 2025-10-11_15:45:06 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +1.314898e+04 1 256 +1.332401e+04 2 256 +1.369745e+04 4 256 +1.359022e+04 8 256 +1.360893e+04 16 256 +1.354758e+04 32 256 +1.335068e+04 64 256 +1.340355e+04 128 256 +1.338225e+04 256 256 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +### GPU: scaling test 32 +6.222590e+03 1 32 +1.054070e+04 2 32 +1.256578e+04 4 32 +1.334543e+04 8 32 +1.351998e+04 16 32 +1.363026e+04 32 32 +1.353031e+04 64 32 +1.331302e+04 128 32 +1.311792e+04 256 32 +1.318049e+04 512 32 +1.308983e+04 1024 32 +1.314766e+04 2048 32 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. 
+========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +7.572551e+01 1 256 +7.477397e+01 2 256 +7.590781e+01 4 256 +### CPU: scaling test 32 +7.544857e+01 1 32 +7.629914e+01 2 32 +7.644630e+01 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.436664e+02 1 256 +1.430259e+02 2 256 +1.425156e+02 4 256 +### CPU: scaling test 32 +1.332283e+02 1 32 +1.407923e+02 2 32 +1.434345e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.322512e+02 1 256 +3.302235e+02 2 256 +3.299895e+02 4 256 +### CPU: scaling test 32 +3.290820e+02 1 32 +3.272276e+02 2 32 +3.284861e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.744622e+02 1 256 +3.794847e+02 2 256 +3.813583e+02 4 256 +### CPU: scaling test 32 +3.817338e+02 1 32 +3.782027e+02 2 32 +3.808702e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.362403e+02 1 256 +3.316419e+02 2 256 +3.338911e+02 4 256 +### CPU: scaling test 32 +3.305571e+02 1 32 +3.318824e+02 2 32 +3.293878e+02 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 17692fc5fb..cc68408e75 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,6 +10,7 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make: Nothing to be done for 'all'. @@ -21,215 +25,189 @@ make: Nothing to be done for 'all'. make: Nothing to be done for 'all'. 
-DATE: 2024-10-06_09:07:10 +DATE: 2025-10-11_15:29:32 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.059500e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.059934e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.060148e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.298542e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.302743e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.303449e+04 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.453264 sec -INFO: No Floating Point Exceptions have been reported - 8,089,923,192 cycles # 2.904 GHz - 15,932,007,883 instructions # 1.97 insn per cycle - 2.843483231 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.859583 sec + 3,373,995,346 cycles # 2.854 GHz + 5,824,456,888 instructions # 1.73 insn per cycle + 1.243469488 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 254 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.246459e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.248360e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.248591e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.340939e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.341409e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.341443e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 4.019480 sec -INFO: No Floating Point Exceptions have been reported - 12,563,980,059 cycles # 2.886 GHz - 29,860,686,581 instructions # 2.38 insn per cycle - 4.410635015 seconds time elapsed +TOTAL : 2.040862 sec + 6,994,210,497 cycles # 2.880 GHz + 14,374,198,066 instructions # 2.06 insn per cycle + 2.485321107 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 9.872263e-03 -Avg ME (F77/GPU) = 9.8722595284406640E-003 -Relative difference = 3.5164777671934515e-07 +Avg ME (F77/GPU) = 9.8722595284406675E-003 +Relative difference = 3.5164777636791134e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating 
Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 7.535286e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.535490e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.535490e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.481211e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.481430e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.481430e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 7.007645 sec -INFO: No Floating Point Exceptions have been reported - 18,987,096,753 cycles # 2.709 GHz - 53,904,905,030 instructions # 2.84 insn per cycle - 7.011475835 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:32424) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.060224 sec + 18,790,658,377 cycles # 2.660 GHz + 53,598,343,943 instructions # 2.85 insn per cycle + 7.064353743 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:32461) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.576045e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.576133e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.576133e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.428763e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.428836e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.428836e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.352060 sec -INFO: No Floating Point Exceptions have been reported - 9,813,557,960 cycles # 2.925 GHz - 27,153,109,398 instructions # 2.77 insn per cycle - 3.355902855 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96492) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.697310 sec + 9,985,153,992 cycles # 2.699 GHz + 27,152,471,347 instructions # 2.72 insn per cycle + 3.701453086 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96385) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.392533e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.392946e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.392946e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.245847e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.246221e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.246221e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.558312 sec -INFO: No Floating Point Exceptions have been reported - 4,259,121,658 cycles # 2.728 GHz - 9,591,809,021 instructions # 2.25 insn per cycle - 1.562248696 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84961) (512y: 0) (512z: 0) +TOTAL : 1.628561 sec + 4,350,647,315 cycles # 2.666 GHz + 9,591,385,784 instructions # 2.20 insn per cycle + 1.632600458 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84998) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.852746e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.853256e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.853256e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.817880e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.818408e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.818408e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.371089 sec -INFO: No Floating Point Exceptions have been reported - 3,728,351,942 cycles # 2.713 GHz - 8,515,110,933 instructions # 2.28 insn per cycle - 1.374961080 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80609) (512y: 90) (512z: 0) +TOTAL : 1.385265 sec + 3,747,713,325 cycles # 2.699 GHz + 8,516,229,683 instructions # 2.27 insn per cycle + 1.389377029 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80598) (512y: 55) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.432608e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.433087e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.433087e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.278490e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.278974e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.278974e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.541076 sec -INFO: No Floating Point Exceptions have been reported - 2,702,698,179 cycles # 1.750 GHz - 4,282,306,811 instructions # 1.58 insn per cycle - 1.545099546 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2856) (512y: 102) (512z:79114) +TOTAL : 1.612258 sec + 2,716,765,553 cycles # 1.682 GHz + 4,276,097,512 instructions # 1.57 insn per cycle + 1.616451427 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2866) (512y: 71) (512z:79097) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
 DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 9.872263e-03
 Avg ME (F77/C++) = 9.8722595285411531E-003
 Relative difference = 3.516375977906115e-07
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_blasOn.scaling
new file mode 100644
index 0000000000..8b91486c13
--- /dev/null
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_blasOn.scaling
@@ -0,0 +1,118 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
+
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
+BACKEND=cpp512y (was cppauto)
+OMPFLAGS=
+FPTYPE='m'
+HELINL='0'
+HRDCOD='0'
+HASCURAND=hasCurand
+HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
+Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
+make: Nothing to be done for 'gtestlibs'.
+make: Nothing to be done for 'all'.
+
+make: Nothing to be done for 'all'.
+
+make: Nothing to be done for 'all'.
+
+make: Nothing to be done for 'all'.
+
+make: Nothing to be done for 'all'.
+
+make: Nothing to be done for 'all'.
+
+DATE: 2025-10-11_16:01:16
+
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=1
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe
+### GPU: scaling test 256
+1.582972e+04 1 256
+1.581496e+04 2 256
+1.648948e+04 4 256
+1.646203e+04 8 256
+1.669439e+04 16 256
+1.647826e+04 32 256
+1.616020e+04 64 256
+1.617952e+04 128 256
+check_cuda.exe: Assertion `code == gpuSuccess' failed.
+check_cuda.exe: Assertion `code == gpuSuccess' failed.
+check_cuda.exe: Assertion `code == gpuSuccess' failed.
+### GPU: scaling test 32
+6.365790e+03 1 32
+1.117842e+04 2 32
+1.456730e+04 4 32
+1.611806e+04 8 32
+1.598649e+04 16 32
+1.653700e+04 32 32
+1.595595e+04 64 32
+1.589958e+04 128 32
+1.560604e+04 256 32
+1.549794e+04 512 32
+1.560588e+04 1024 32
+check_cuda.exe: Assertion `code == gpuSuccess' failed.
+check_cuda.exe: Assertion `code == gpuSuccess' failed.
+check_cuda.exe: Assertion `code == gpuSuccess' failed.
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+7.550960e+01 1 256
+7.583079e+01 2 256
+7.562936e+01 4 256
+### CPU: scaling test 32
+7.095115e+01 1 32
+7.526184e+01 2 32
+7.561728e+01 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+1.416397e+02 1 256
+1.419941e+02 2 256
+1.424152e+02 4 256
+### CPU: scaling test 32
+1.379937e+02 1 32
+1.386213e+02 2 32
+1.419191e+02 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+3.312097e+02 1 256
+3.311144e+02 2 256
+3.322186e+02 4 256
+### CPU: scaling test 32
+3.304901e+02 1 32
+3.322880e+02 2 32
+3.277376e+02 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+3.821829e+02 1 256
+3.805165e+02 2 256
+3.788227e+02 4 256
+### CPU: scaling test 32
+3.729139e+02 1 32
+3.757926e+02 2 32
+3.738019e+02 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+3.317613e+02 1 256
+3.319298e+02 2 256
+3.365958e+02 4 256
+### CPU: scaling test 32
+3.353901e+02 1 32
+3.366346e+02 2 32
+3.378136e+02 4 32
+=========================================================================
+
+TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt
index 1cf857b709..4b40dd2c65 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
 
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
 BACKEND=cpp512y (was cppauto)
 OMPFLAGS=
 FPTYPE='m'
@@ -7,6 +10,7 @@ HELINL='0'
 HRDCOD='0'
 HASCURAND=hasCurand
 HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
 Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
 make: Nothing to be done for 'gtestlibs'.
 make: Nothing to be done for 'all'.
@@ -21,239 +25,197 @@ make: Nothing to be done for 'all'.
 make: Nothing to be done for 'all'.
-DATE: 2024-10-06_09:40:38 +DATE: 2025-10-11_16:32:38 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 2 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.054825e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.057209e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.057209e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.248729e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.286569e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.286569e+04 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.388056 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 7,931,671,790 cycles # 2.924 GHz - 17,623,602,431 instructions # 2.22 insn per cycle - 2.770306640 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 --bridge +TOTAL : 0.825135 sec + 3,263,718,300 cycles # 2.850 GHz + 5,063,977,049 instructions # 1.55 insn per cycle + 1.201910757 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -WARNING! 
Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 254 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.226146e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.260909e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.260909e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.351586e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.359293e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.359293e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.992337 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 12,629,951,963 cycles # 2.926 GHz - 29,269,734,483 instructions # 2.32 insn per cycle - 4.375813430 seconds time elapsed +TOTAL : 2.006826 sec + 6,868,164,513 cycles # 2.869 GHz + 12,771,043,874 instructions # 1.86 insn per cycle + 2.451670895 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 9.872263e-03 -Avg ME (F77/GPU) = 9.8722595284406640E-003 -Relative difference = 3.5164777671934515e-07 +Avg ME (F77/GPU) = 9.8722595284406675E-003 +Relative difference = 3.5164777636791134e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 7.889828e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.890068e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.890068e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.508335e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.508560e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.508560e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.696425 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 18,936,809,312 cycles # 2.827 GHz - 53,907,854,112 instructions # 2.85 insn per cycle - 6.700731218 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:32424) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.038136 sec + 18,717,847,899 cycles # 2.659 GHz + 53,598,418,673 instructions # 2.86 insn per cycle + 7.042371275 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:32461) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.586455e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.586548e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.586548e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.418673e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.418747e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.418747e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.330534 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 9,805,857,457 cycles # 2.941 GHz - 27,153,288,385 instructions # 2.77 insn per cycle - 3.335034911 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96492) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.725271 sec + 9,999,898,907 cycles # 2.682 GHz + 27,154,408,541 instructions # 2.72 insn per cycle + 3.729470107 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96385) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.386158e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.386550e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.386550e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.288517e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.288903e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.288903e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.562759 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 4,284,138,212 cycles # 2.735 GHz - 9,593,930,746 instructions # 2.24 insn per cycle - 1.567182963 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84961) (512y: 0) (512z: 0) +TOTAL : 1.608418 sec + 4,321,971,855 cycles # 2.681 GHz + 9,593,457,987 instructions # 2.22 insn per cycle + 1.612824235 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84998) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, 
FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.892770e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.893321e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.893321e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.731794e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.732300e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.732300e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.359134 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 3,729,884,424 cycles # 2.737 GHz - 8,517,697,790 instructions # 2.28 insn per cycle - 1.363667603 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80609) (512y: 90) (512z: 0) +TOTAL : 1.417269 sec + 3,781,284,257 cycles # 2.661 GHz + 8,518,492,306 instructions # 2.25 insn per cycle + 1.421504706 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80598) (512y: 55) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.423206e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.423718e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.423718e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.320041e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.320569e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.320569e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.547281 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,698,269,777 cycles # 1.739 GHz - 4,283,935,635 instructions # 1.59 insn per cycle - 1.552053679 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2856) (512y: 102) (512z:79114) +TOTAL : 1.593109 sec + 2,718,981,575 cycles # 1.703 GHz + 4,277,734,000 instructions # 1.57 insn per cycle + 1.597391554 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2866) (512y: 71) (512z:79097) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt index bc67f5cacf..a8f385308e 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,6 +10,7 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make: Nothing to be done for 'all'. @@ -21,215 +25,189 @@ make: Nothing to be done for 'all'. make: Nothing to be done for 'all'. 
-DATE: 2024-10-06_09:08:38 +DATE: 2025-10-11_15:31:21 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.058591e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.058974e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.059077e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.314413e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.318852e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.319620e+04 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.451568 sec -INFO: No Floating Point Exceptions have been reported - 8,115,809,761 cycles # 2.919 GHz - 18,292,352,744 instructions # 2.25 insn per cycle - 2.835762935 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.824375 sec + 3,263,300,002 cycles # 2.859 GHz + 5,743,287,797 instructions # 1.76 insn per cycle + 1.201709138 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 254 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.228388e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.230439e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.230672e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.342823e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.343338e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.343373e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 4.019291 sec -INFO: No Floating Point Exceptions have been reported - 12,725,284,497 cycles # 2.922 GHz - 29,505,773,730 instructions # 2.32 insn per cycle - 4.410068917 seconds time elapsed +TOTAL : 2.030004 sec + 6,944,802,894 cycles # 2.872 GHz + 14,733,879,509 instructions # 2.12 insn per cycle + 2.474432206 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 9.872263e-03 -Avg ME (F77/GPU) = 9.8722595284406640E-003 -Relative difference = 3.5164777671934515e-07 +Avg ME (F77/GPU) = 9.8722595284406675E-003 +Relative difference = 3.5164777636791134e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating 
Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 7.905987e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.906203e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.906203e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.570860e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.571065e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.571065e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.685741 sec -INFO: No Floating Point Exceptions have been reported - 18,901,791,742 cycles # 2.826 GHz - 53,936,334,501 instructions # 2.85 insn per cycle - 6.689520607 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:32022) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.976560 sec + 18,730,478,677 cycles # 2.684 GHz + 53,589,432,540 instructions # 2.86 insn per cycle + 6.980695916 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:32012) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.555988e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.556078e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.556078e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.411301e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.411372e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.411372e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.395185 sec -INFO: No Floating Point Exceptions have been reported - 9,954,308,036 cycles # 2.929 GHz - 27,130,330,125 instructions # 2.73 insn per cycle - 3.399134205 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96368) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.742394 sec + 10,077,544,611 cycles # 2.691 GHz + 27,148,181,137 instructions # 2.69 insn per cycle + 3.746519189 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96336) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.364235e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.364649e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.364649e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.358190e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.358704e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.358704e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.571658 sec -INFO: No Floating Point Exceptions have been reported - 4,284,967,782 cycles # 2.721 GHz - 9,585,542,173 instructions # 2.24 insn per cycle - 1.575575323 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84968) (512y: 0) (512z: 0) +TOTAL : 1.574465 sec + 4,261,924,263 cycles # 2.701 GHz + 9,596,051,273 instructions # 2.25 insn per cycle + 1.578699681 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85013) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.898680e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.899276e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.899276e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.774770e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.775320e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.775320e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.358371 sec -INFO: No Floating Point Exceptions have been reported - 3,717,774,700 cycles # 2.731 GHz - 8,507,853,536 instructions # 2.29 insn per cycle - 1.362296235 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80632) (512y: 240) (512z: 0) +TOTAL : 1.400584 sec + 3,755,242,155 cycles # 2.675 GHz + 8,521,276,194 instructions # 2.27 insn per cycle + 1.404663616 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80635) (512y: 225) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.399522e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.400013e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.400013e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.329909e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.330461e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.330461e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.555521 sec -INFO: No Floating Point Exceptions have been reported - 2,693,302,897 cycles # 1.729 GHz - 4,281,674,096 instructions # 1.59 insn per cycle - 1.559394081 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2693) (512y: 184) (512z:79098) +TOTAL : 1.587980 sec + 2,712,476,158 cycles # 1.704 GHz + 4,282,456,457 instructions # 1.58 insn per cycle + 1.592350341 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2702) (512y: 175) (512z:79107) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
 DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 9.872263e-03
 Avg ME (F77/C++) = 9.8722595285411531E-003
 Relative difference = 3.516375977906115e-07
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.scaling
new file mode 100644
index 0000000000..2d50000d27
--- /dev/null
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.scaling
@@ -0,0 +1,118 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
+
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
+BACKEND=cpp512y (was cppauto)
+OMPFLAGS=
+FPTYPE='m'
+HELINL='0'
+HRDCOD='0'
+HASCURAND=hasCurand
+HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
+Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
+make: Nothing to be done for 'gtestlibs'.
+make: Nothing to be done for 'all'.
+
+make: Nothing to be done for 'all'.
+
+make: Nothing to be done for 'all'.
+
+make: Nothing to be done for 'all'.
+
+make: Nothing to be done for 'all'.
+
+make: Nothing to be done for 'all'.
+
+DATE: 2025-10-11_15:49:04
+
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe
+### GPU: scaling test 256
+3.189617e+04 1 256
+3.247454e+04 2 256
+3.572888e+04 4 256
+3.576406e+04 8 256
+3.574054e+04 16 256
+3.604686e+04 32 256
+3.591831e+04 64 256
+3.590498e+04 128 256
+3.586335e+04 256 256
+check_cuda.exe: Assertion `code == gpuSuccess' failed.
+check_cuda.exe: Assertion `code == gpuSuccess' failed.
+### GPU: scaling test 32
+7.716223e+03 1 32
+1.405251e+04 2 32
+2.073573e+04 4 32
+2.779764e+04 8 32
+3.326750e+04 16 32
+3.550921e+04 32 32
+3.542979e+04 64 32
+3.536735e+04 128 32
+3.605303e+04 256 32
+3.612470e+04 512 32
+3.604579e+04 1024 32
+3.604477e+04 2048 32
+check_cuda.exe: Assertion `code == gpuSuccess' failed.
+check_cuda.exe: Assertion `code == gpuSuccess' failed.
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+8.499895e+01 1 256
+8.500354e+01 2 256
+8.502793e+01 4 256
+### CPU: scaling test 32
+8.566387e+01 1 32
+8.564579e+01 2 32
+8.546968e+01 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+3.082111e+02 1 256
+3.057097e+02 2 256
+3.015791e+02 4 256
+### CPU: scaling test 32
+3.031632e+02 1 32
+3.047989e+02 2 32
+3.016953e+02 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+6.617272e+02 1 256
+6.661900e+02 2 256
+6.680386e+02 4 256
+### CPU: scaling test 32
+6.677614e+02 1 32
+6.719546e+02 2 32
+6.659846e+02 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+7.611249e+02 1 256
+7.606905e+02 2 256
+7.604096e+02 4 256
+### CPU: scaling test 32
+7.550844e+02 1 32
+7.531491e+02 2 32
+7.562334e+02 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+6.623690e+02 1 256
+6.648693e+02 2 256
+6.677195e+02 4 256
+### CPU: scaling test 32
+6.549910e+02 1 32
+6.592485e+02 2 32
+6.593529e+02 4 32
+=========================================================================
+
+TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt
index e477be7c61..8d906ea4bc 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
 
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
 BACKEND=cpp512y (was cppauto)
 OMPFLAGS=
 FPTYPE='m'
@@ -7,6 +10,7 @@ HELINL='0'
 HRDCOD='0'
 HASCURAND=hasCurand
 HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
 Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
 make: Nothing to be done for 'gtestlibs'.
 make: Nothing to be done for 'all'.
@@ -21,215 +25,189 @@ make: Nothing to be done for 'all'.
 make: Nothing to be done for 'all'.
-DATE: 2024-10-06_09:13:00 +DATE: 2025-10-11_15:36:41 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.207250e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.207995e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.208247e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.762040 sec -INFO: No Floating Point Exceptions have been reported - 5,937,636,063 cycles # 2.916 GHz - 12,374,083,331 instructions # 2.08 insn per cycle - 2.091996677 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 3.066576e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.085305e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.089254e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.186984e-05 +- 9.824900e-06 ) GeV^-6 +TOTAL : 0.755600 sec + 2,946,115,284 cycles # 2.846 GHz + 5,005,757,693 instructions # 1.70 insn per cycle + 1.092047091 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 254 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.149439e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.150073e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.150179e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.576872e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.578746e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.578931e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856829e-04 +- 8.333437e-05 ) GeV^-6 -TOTAL : 2.066345 sec -INFO: No Floating Point Exceptions have been reported - 6,803,203,568 cycles # 2.918 GHz - 14,656,096,283 instructions # 2.15 insn per cycle - 2.390130877 seconds time elapsed +TOTAL : 1.197902 sec + 4,252,156,323 cycles # 2.858 GHz + 7,968,205,533 instructions # 1.87 insn per cycle + 1.544878632 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 9.849635e-03 -Avg ME (F77/GPU) = 9.8712451931260159E-003 -Relative difference = 0.0021940095370046923 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 9.849633e-03 +Avg ME (F77/GPU) = 9.8712433304319249E-003 +Relative difference = 0.0021940239227111213 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= 
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.548424e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.548685e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.548685e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.452149e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.452401e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.452401e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 6.179003 sec -INFO: No Floating Point Exceptions have been reported - 18,168,840,210 cycles # 2.939 GHz - 53,911,011,794 instructions # 2.97 insn per cycle - 6.183081263 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:20141) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.250789 sec + 18,004,786,092 cycles # 2.879 GHz + 53,363,354,008 instructions # 2.96 insn per cycle + 6.254568811 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20332) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087551509E-003 -Relative difference = 2.119780432912131e-08 +Avg ME (F77/C++) = 9.8479612087517612E-003 +Relative difference = 2.1197460131000295e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.395658e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.396067e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.396067e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.083892e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.084249e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.084249e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.556967 sec -INFO: No Floating Point Exceptions have been reported - 4,597,936,627 cycles # 2.947 GHz - 13,808,300,252 instructions # 3.00 insn per cycle - 1.560798930 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:97016) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.714898 sec + 4,637,516,396 cycles # 2.699 GHz + 13,808,277,295 instructions # 2.98 insn per cycle + 1.718840547 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96992) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 
4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 Avg ME (F77/C++) = 9.8479546896367235E-003 Relative difference = 3.1515505172940424e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.833708e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.835461e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.835461e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.679481e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.681146e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.681146e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.774770 sec -INFO: No Floating Point Exceptions have been reported - 2,127,367,774 cycles # 2.734 GHz - 4,836,875,487 instructions # 2.27 insn per cycle - 0.778636721 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85494) (512y: 0) (512z: 0) +TOTAL : 0.793237 sec + 2,148,565,219 cycles # 2.697 GHz + 4,837,105,097 instructions # 2.25 insn per cycle + 0.797286288 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85530) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 Avg ME (F77/C++) = 9.8929728161091246E-003 Relative difference = 1.8588029579156084e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.729108e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.731291e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.731291e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.502213e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.504225e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.504225e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.685221 sec -INFO: No Floating Point Exceptions have been reported - 1,884,703,570 cycles # 2.737 GHz - 4,291,263,737 instructions # 2.28 insn per cycle - 0.689203509 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81183) (512y: 45) (512z: 0) +TOTAL : 0.706205 sec + 1,896,245,897 cycles # 2.672 GHz + 4,291,845,754 instructions # 2.26 insn per cycle + 0.710269657 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81171) (512y: 10) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 Avg ME (F77/C++) = 9.8929728161091246E-003 Relative difference = 1.8588029579156084e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.870048e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.872187e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.872187e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.536289e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.538258e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.538258e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.771101 sec -INFO: No Floating Point Exceptions have been reported - 1,354,646,750 cycles # 1.748 GHz - 2,162,779,823 instructions # 1.60 insn per cycle - 0.775438585 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3481) (512y: 45) (512z:79330) +TOTAL : 0.810162 sec + 1,363,414,955 cycles # 1.676 GHz + 2,159,791,218 instructions # 1.58 insn per cycle + 0.814367082 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3501) (512y: 15) (512z:79315) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892981e-03 Avg ME (F77/C++) = 9.8929811982676284E-003 Relative difference = 2.004124217057488e-08 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_blasOn.scaling new file mode 100644 index 0000000000..b311421434 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_blasOn.scaling @@ -0,0 +1,118 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +DATE: 2025-10-11_16:05:58 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +3.033893e+04 1 256 +3.187494e+04 2 256 +3.481987e+04 4 256 +3.512251e+04 8 256 +3.538857e+04 16 256 +3.542822e+04 32 256 +3.543221e+04 64 256 +3.537512e+04 128 256 +3.502452e+04 256 256 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +### GPU: scaling test 32 +7.725986e+03 1 32 +1.328194e+04 2 32 +1.942036e+04 4 32 +2.633854e+04 8 32 +3.294887e+04 16 32 +3.493545e+04 32 32 +3.529299e+04 64 32 +3.546637e+04 128 32 +3.548686e+04 256 32 +3.523534e+04 512 32 +3.522952e+04 1024 32 +3.514012e+04 2048 32 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. 
+========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +8.495344e+01 1 256 +8.539448e+01 2 256 +8.496927e+01 4 256 +### CPU: scaling test 32 +8.470460e+01 1 32 +8.470926e+01 2 32 +8.506051e+01 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.029024e+02 1 256 +3.058068e+02 2 256 +3.092272e+02 4 256 +### CPU: scaling test 32 +3.088673e+02 1 32 +3.061911e+02 2 32 +3.071123e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.653819e+02 1 256 +6.661146e+02 2 256 +6.676979e+02 4 256 +### CPU: scaling test 32 +6.681941e+02 1 32 +6.675336e+02 2 32 +6.688978e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +7.615474e+02 1 256 +7.624411e+02 2 256 +7.580407e+02 4 256 +### CPU: scaling test 32 +7.724123e+02 1 32 +7.622893e+02 2 32 +7.629688e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.726799e+02 1 256 +6.675111e+02 2 256 +6.619522e+02 4 256 +### CPU: scaling test 32 +6.616673e+02 1 32 +6.588386e+02 2 32 +6.622712e+02 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt index 09d523a948..66637c5d79 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,6 +10,7 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make: Nothing to be done for 'all'. @@ -21,239 +25,197 @@ make: Nothing to be done for 'all'. make: Nothing to be done for 'all'. 
-DATE: 2024-10-06_09:42:06 +DATE: 2025-10-11_16:34:27 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 2 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.291704e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.296560e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.296560e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187093e-05 +- 9.825663e-06 ) GeV^-6 -TOTAL : 1.680127 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 5,675,085,647 cycles # 2.923 GHz - 11,509,492,893 instructions # 2.03 insn per cycle - 1.997903242 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 --bridge +EvtsPerSec[Rmb+ME] (23) = ( 2.846569e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.930073e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.930073e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.187094e-05 +- 9.825664e-06 ) GeV^-6 +TOTAL : 0.744004 sec + 2,812,928,508 cycles # 2.768 GHz + 4,058,280,243 instructions # 1.44 insn per cycle + 1.074142514 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -WARNING! 
Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 254 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.120892e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.132073e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.132073e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856440e-04 +- 8.331091e-05 ) GeV^-6 -TOTAL : 2.037220 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 6,712,310,342 cycles # 2.924 GHz - 13,777,135,261 instructions # 2.05 insn per cycle - 2.354099539 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.542471e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.575116e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.575116e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856440e-04 +- 8.331090e-05 ) GeV^-6 +TOTAL : 1.186896 sec + 4,180,690,234 cycles # 2.849 GHz + 8,037,777,996 instructions # 1.92 insn per cycle + 1.534789099 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 9.849635e-03 -Avg ME (F77/GPU) = 9.8712451931260159E-003 -Relative difference = 0.0021940095370046923 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 9.849633e-03 +Avg ME (F77/GPU) = 9.8712433304319249E-003 +Relative difference = 0.0021940239227111213 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.574125e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.574397e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.574397e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.504304e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.504560e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.504560e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 6.159980 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 18,121,008,944 cycles # 2.940 GHz - 53,916,989,652 instructions # 2.98 insn per cycle - 6.164330765 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:20141) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.212057 sec + 17,925,660,588 cycles # 2.884 GHz + 53,364,413,300 instructions # 2.98 insn per cycle + 6.216192253 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20332) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087551509E-003 -Relative difference = 2.119780432912131e-08 +Avg ME (F77/C++) = 9.8479612087517612E-003 +Relative difference = 2.1197460131000295e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.371688e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.372089e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.372089e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.026780e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.027128e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.027128e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.568419 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 4,624,959,734 cycles # 2.942 GHz - 13,809,578,618 instructions # 2.99 insn per cycle - 1.572870258 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:97016) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.746031 sec + 4,640,321,340 cycles # 2.653 GHz + 13,810,267,539 instructions # 2.98 insn per cycle + 1.750270483 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96992) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 Avg ME (F77/C++) = 9.8479546896367235E-003 Relative difference = 3.1515505172940424e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.853120e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.854860e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.854860e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.541416e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.543021e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.543021e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.772760 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,127,660,282 cycles # 2.740 GHz - 4,839,303,130 instructions # 2.27 insn per cycle - 0.777110537 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85494) (512y: 0) (512z: 0) +TOTAL : 0.809578 sec + 2,161,931,873 cycles # 2.659 GHz + 4,839,517,439 instructions # 2.24 insn per cycle + 0.813642934 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85530) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, 
FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 Avg ME (F77/C++) = 9.8929728161091246E-003 Relative difference = 1.8588029579156084e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.707103e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.709607e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.709607e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.420966e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.422988e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.422988e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.687680 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 1,889,891,398 cycles # 2.733 GHz - 4,293,271,631 instructions # 2.27 insn per cycle - 0.692031150 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81183) (512y: 45) (512z: 0) +TOTAL : 0.714158 sec + 1,911,038,749 cycles # 2.664 GHz + 4,293,943,131 instructions # 2.25 insn per cycle + 0.718267339 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81171) (512y: 10) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 Avg ME (F77/C++) = 9.8929728161091246E-003 Relative difference = 1.8588029579156084e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.738421e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.740575e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.740575e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.647126e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.649133e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.649133e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.785848 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 1,358,106,687 cycles # 1.720 GHz - 2,165,384,980 instructions # 1.59 insn per cycle - 0.790493646 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3481) (512y: 45) (512z:79330) +TOTAL : 0.797274 sec + 1,365,650,123 cycles # 1.706 GHz + 2,161,762,081 instructions # 1.58 insn per cycle + 0.801641364 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3501) (512y: 15) (512z:79315) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892981e-03 Avg ME (F77/C++) = 9.8929811982676284E-003 Relative difference = 2.004124217057488e-08 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt index 33a64296d4..a85d1bcb39 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,6 +10,7 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make: Nothing to be done for 'all'. @@ -21,215 +25,189 @@ make: Nothing to be done for 'all'. make: Nothing to be done for 'all'. 
-DATE: 2024-10-06_09:14:03 +DATE: 2025-10-11_15:38:06 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.196404e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.197145e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.197475e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.762965 sec -INFO: No Floating Point Exceptions have been reported - 5,951,937,078 cycles # 2.924 GHz - 11,910,577,864 instructions # 2.00 insn per cycle - 2.092003198 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 3.071043e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.090506e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.094612e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.186984e-05 +- 9.824900e-06 ) GeV^-6 +TOTAL : 0.757789 sec + 2,958,910,358 cycles # 2.847 GHz + 4,794,775,632 instructions # 1.62 insn per cycle + 1.096595085 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 254 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.150073e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.150749e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.150840e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.567606e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.569510e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.569696e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856829e-04 +- 8.333437e-05 ) GeV^-6 -TOTAL : 2.074025 sec -INFO: No Floating Point Exceptions have been reported - 6,857,187,374 cycles # 2.930 GHz - 14,190,515,168 instructions # 2.07 insn per cycle - 2.396988151 seconds time elapsed +TOTAL : 1.206702 sec + 4,225,242,901 cycles # 2.841 GHz + 8,156,770,765 instructions # 1.93 insn per cycle + 1.554101217 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 9.849635e-03 -Avg ME (F77/GPU) = 9.8712451931260107E-003 -Relative difference = 0.0021940095370041636 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 9.849633e-03 +Avg ME (F77/GPU) = 9.8712433304319249E-003 +Relative difference = 0.0021940239227111213 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= 
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.597266e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.597536e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.597536e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.507145e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.507418e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.507418e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 6.144692 sec -INFO: No Floating Point Exceptions have been reported - 18,086,727,911 cycles # 2.942 GHz - 53,895,836,183 instructions # 2.98 insn per cycle - 6.148512893 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:20141) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.208388 sec + 17,992,278,108 cycles # 2.897 GHz + 53,336,143,963 instructions # 2.96 insn per cycle + 6.212278042 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20135) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087572898E-003 -Relative difference = 2.1198021522715588e-08 +Avg ME (F77/C++) = 9.8479612087558014E-003 +Relative difference = 2.119787038556726e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.388656e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.389069e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.389069e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.069142e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.069523e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.069523e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.560721 sec -INFO: No Floating Point Exceptions have been reported - 4,571,260,015 cycles # 2.924 GHz - 13,800,942,063 instructions # 3.02 insn per cycle - 1.564719207 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96651) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.722052 sec + 4,637,939,725 cycles # 2.688 GHz + 13,805,971,610 instructions # 2.98 insn per cycle + 1.726097842 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96840) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 
4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 Avg ME (F77/C++) = 9.8479546896065809E-003 Relative difference = 3.151856596628469e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.702410e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.704003e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.704003e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.610751e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.612520e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.612520e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.789887 sec -INFO: No Floating Point Exceptions have been reported - 2,151,012,254 cycles # 2.712 GHz - 4,840,938,021 instructions # 2.25 insn per cycle - 0.793816354 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85884) (512y: 0) (512z: 0) +TOTAL : 0.800943 sec + 2,170,709,754 cycles # 2.698 GHz + 4,844,490,730 instructions # 2.23 insn per cycle + 0.805141444 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85852) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 Avg ME (F77/C++) = 9.8929728161091923E-003 Relative difference = 1.85880227405429e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.657646e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.659745e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.659745e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.606901e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.608951e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.608951e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.691425 sec -INFO: No Floating Point Exceptions have been reported - 1,894,431,690 cycles # 2.727 GHz - 4,294,884,277 instructions # 2.27 insn per cycle - 0.695223368 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81725) (512y: 25) (512z: 0) +TOTAL : 0.696038 sec + 1,884,685,200 cycles # 2.695 GHz + 4,299,634,626 instructions # 2.28 insn per cycle + 0.700035846 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81642) (512y: 10) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 Avg ME (F77/C++) = 9.8929728161091923E-003 Relative difference = 1.85880227405429e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.673392e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.675470e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.675470e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.489547e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.491608e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.491608e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.793743 sec -INFO: No Floating Point Exceptions have been reported - 1,366,656,580 cycles # 1.715 GHz - 2,169,713,805 instructions # 1.59 insn per cycle - 0.797745119 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4092) (512y: 32) (512z:79551) +TOTAL : 0.816037 sec + 1,366,505,808 cycles # 1.668 GHz + 2,169,050,969 instructions # 1.59 insn per cycle + 0.820326650 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4103) (512y: 24) (512z:79552) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892981e-03 Avg ME (F77/C++) = 9.8929811982957326E-003 Relative difference = 2.0044082998332894e-08 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.scaling new file mode 100644 index 0000000000..53bb1cfda7 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.scaling @@ -0,0 +1,118 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +DATE: 2025-10-11_15:47:09 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +1.616958e+04 1 256 +1.637015e+04 2 256 +1.727451e+04 4 256 +1.703878e+04 8 256 +1.713757e+04 16 256 +1.692549e+04 32 256 +1.662520e+04 64 256 +1.655737e+04 128 256 +1.660158e+04 256 256 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +### GPU: scaling test 32 +6.521951e+03 1 32 +1.124531e+04 2 32 +1.474858e+04 4 32 +1.618404e+04 8 32 +1.651807e+04 16 32 +1.695250e+04 32 32 +1.681150e+04 64 32 +1.629231e+04 128 32 +1.600637e+04 256 32 +1.595680e+04 512 32 +1.609152e+04 1024 32 +1.606225e+04 2048 32 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. 
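Each "### GPU: scaling test" or "### CPU: scaling test" block above prints one line per configuration in the form "<EvtsPerSec> <nblocks> <nthreads>", so throughput can be read off against the number of events in flight (nblocks * nthreads). Below is a minimal sketch of a reader for that three-column format; it is a hypothetical helper for inspecting these logs, not part of the repository, and it simply skips the header and assertion lines that do not match the format.

```cpp
// Illustrative only: parse "<EvtsPerSec> <nblocks> <nthreads>" lines from a .scaling log
// (piped to stdin) and print throughput versus the number of events in flight.
#include <iostream>
#include <sstream>
#include <string>

int main()
{
  std::string line;
  while( std::getline( std::cin, line ) )
  {
    std::istringstream iss( line );
    double evtsPerSec;
    int nblocks, nthreads;
    if( iss >> evtsPerSec >> nblocks >> nthreads ) // non-matching lines are ignored
      std::cout << nblocks * nthreads << " events in flight -> "
                << evtsPerSec << " events/s" << std::endl;
  }
  return 0;
}
```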
+========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +7.530837e+01 1 256 +7.486415e+01 2 256 +7.494008e+01 4 256 +### CPU: scaling test 32 +7.525282e+01 1 32 +7.477017e+01 2 32 +7.524610e+01 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.548840e+02 1 256 +1.522353e+02 2 256 +1.543201e+02 4 256 +### CPU: scaling test 32 +1.576268e+02 1 32 +1.582873e+02 2 32 +1.506909e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.557154e+02 1 256 +3.547270e+02 2 256 +3.557554e+02 4 256 +### CPU: scaling test 32 +3.614135e+02 1 32 +3.600100e+02 2 32 +3.596141e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.001766e+02 1 256 +4.125953e+02 2 256 +4.090213e+02 4 256 +### CPU: scaling test 32 +4.084924e+02 1 32 +4.056804e+02 2 32 +4.080579e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.519966e+02 1 256 +3.510473e+02 2 256 +3.460383e+02 4 256 +### CPU: scaling test 32 +3.459963e+02 1 32 +3.417875e+02 2 32 +3.469620e+02 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index a2a6307c02..686f1c46c7 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,6 +10,7 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make: Nothing to be done for 'all'. @@ -21,215 +25,189 @@ make: Nothing to be done for 'all'. make: Nothing to be done for 'all'. 
-DATE: 2024-10-06_09:10:06 +DATE: 2025-10-11_15:33:09 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.665934e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.666477e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.666666e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.606719e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.613205e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.614399e+04 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.202831 sec -INFO: No Floating Point Exceptions have been reported - 7,373,914,452 cycles # 2.913 GHz - 16,351,055,335 instructions # 2.22 insn per cycle - 2.588547453 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.810711 sec + 3,229,171,179 cycles # 2.859 GHz + 5,715,641,917 instructions # 1.77 insn per cycle + 1.191471752 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 255 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.110897e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.111188e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.111222e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.654245e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.655018e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.655075e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.442430 sec -INFO: No Floating Point Exceptions have been reported - 11,070,694,428 cycles # 2.924 GHz - 25,628,142,124 instructions # 2.31 insn per cycle - 3.841933628 seconds time elapsed +TOTAL : 1.784420 sec + 6,293,809,246 cycles # 2.879 GHz + 12,593,045,017 instructions # 2.00 insn per cycle + 2.242570146 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 9.872263e-03 -Avg ME (F77/GPU) = 9.8722599015656498E-003 -Relative difference = 3.1385249252060663e-07 +Avg ME (F77/GPU) = 9.8722595419029543E-003 +Relative difference = 3.502841288596502e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point 
Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 7.567548e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.567783e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.567783e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.469254e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.469466e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.469466e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.978728 sec -INFO: No Floating Point Exceptions have been reported - 19,201,924,470 cycles # 2.751 GHz - 54,137,446,015 instructions # 2.82 insn per cycle - 6.982563293 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:32000) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.071086 sec + 19,047,832,122 cycles # 2.693 GHz + 53,831,188,921 instructions # 2.83 insn per cycle + 7.075248115 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:32461) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595861831675E-003 Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.526848e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.526939e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.526939e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.520487e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.520570e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.520570e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.460419 sec -INFO: No Floating Point Exceptions have been reported - 9,442,620,757 cycles # 2.727 GHz - 26,188,001,033 instructions # 2.77 insn per cycle - 3.464377416 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96049) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.474834 sec + 9,355,185,296 cycles # 2.691 GHz + 25,920,357,243 instructions # 2.77 insn per cycle + 3.478986906 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96092) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594844308162E-003 Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.548969e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.549418e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.549418e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.467313e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.467816e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.467816e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.489614 sec -INFO: No Floating Point Exceptions have been reported - 4,075,741,004 cycles # 2.731 GHz - 9,249,825,182 instructions # 2.27 insn per cycle - 1.493453651 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0) +TOTAL : 1.523962 sec + 3,999,825,927 cycles # 2.619 GHz + 9,105,365,579 instructions # 2.28 insn per cycle + 1.528167166 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:83929) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.098256e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.098850e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.098850e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.083261e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.083882e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.083882e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.290484 sec -INFO: No Floating Point Exceptions have been reported - 3,523,951,603 cycles # 2.724 GHz - 8,183,239,467 instructions # 2.32 insn per cycle - 1.294382992 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80015) (512y: 80) (512z: 0) +TOTAL : 1.295937 sec + 3,509,301,061 cycles # 2.701 GHz + 8,040,567,810 instructions # 2.29 insn per cycle + 1.299964950 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:79768) (512y: 45) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.495372e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.495944e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.495944e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.452173e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.452727e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.452727e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.513924 sec -INFO: No Floating Point Exceptions have been reported - 2,658,314,764 cycles # 1.752 GHz - 4,173,156,780 instructions # 1.57 insn per cycle - 1.517996809 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 92) (512z:78910) +TOTAL : 1.532017 sec + 2,596,809,497 cycles # 1.691 GHz + 4,060,850,927 instructions # 1.56 insn per cycle + 1.536186135 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2509) (512y: 61) (512z:78957) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0_blasOn.scaling new file mode 100644 index 0000000000..a739246eca --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0_blasOn.scaling @@ -0,0 +1,118 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +DATE: 2025-10-11_16:03:38 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +1.525607e+04 1 256 +1.592603e+04 2 256 +1.694297e+04 4 256 +1.694752e+04 8 256 +1.680152e+04 16 256 +1.667228e+04 32 256 +1.648853e+04 64 256 +1.642335e+04 128 256 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +### GPU: scaling test 32 +5.344354e+03 1 32 +9.059524e+03 2 32 +1.316587e+04 4 32 +1.535902e+04 8 32 +1.599627e+04 16 32 +1.690040e+04 32 32 +1.613824e+04 64 32 +1.606066e+04 128 32 +1.607094e+04 256 32 +1.586333e+04 512 32 +1.570749e+04 1024 32 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. 
+========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +7.451618e+01 1 256 +7.447961e+01 2 256 +7.464296e+01 4 256 +### CPU: scaling test 32 +7.454429e+01 1 32 +7.454562e+01 2 32 +7.491906e+01 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.523430e+02 1 256 +1.528849e+02 2 256 +1.545423e+02 4 256 +### CPU: scaling test 32 +1.508465e+02 1 32 +1.522871e+02 2 32 +1.514789e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.569891e+02 1 256 +3.579373e+02 2 256 +3.580811e+02 4 256 +### CPU: scaling test 32 +3.582840e+02 1 32 +3.591263e+02 2 32 +3.590191e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.091335e+02 1 256 +4.101923e+02 2 256 +4.047677e+02 4 256 +### CPU: scaling test 32 +4.052367e+02 1 32 +4.049500e+02 2 32 +4.058871e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.457958e+02 1 256 +3.518110e+02 2 256 +3.523691e+02 4 256 +### CPU: scaling test 32 +3.457462e+02 1 32 +3.517526e+02 2 32 +3.507713e+02 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt index 67fff86657..2c63694669 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,6 +10,7 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make: Nothing to be done for 'all'. @@ -21,215 +25,189 @@ make: Nothing to be done for 'all'. make: Nothing to be done for 'all'. 
-DATE: 2024-10-06_09:11:33 +DATE: 2025-10-11_15:34:55 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.667678e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.668217e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.668387e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.591312e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.597916e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.599015e+04 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.202686 sec -INFO: No Floating Point Exceptions have been reported - 7,336,606,843 cycles # 2.899 GHz - 15,241,236,080 instructions # 2.08 insn per cycle - 2.586897924 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.809629 sec + 3,237,669,928 cycles # 2.864 GHz + 5,681,011,752 instructions # 1.75 insn per cycle + 1.192308721 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 255 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.107552e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.107855e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.107889e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.667525e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.668322e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.668373e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.440073 sec -INFO: No Floating Point Exceptions have been reported - 11,052,276,434 cycles # 2.923 GHz - 25,411,180,343 instructions # 2.30 insn per cycle - 3.836365671 seconds time elapsed +TOTAL : 1.762250 sec + 6,151,588,956 cycles # 2.862 GHz + 12,789,871,898 instructions # 2.08 insn per cycle + 2.206834958 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 9.872263e-03 -Avg ME (F77/GPU) = 9.8722599015656498E-003 -Relative difference = 3.1385249252060663e-07 +Avg ME (F77/GPU) = 9.8722595419029543E-003 +Relative difference = 3.502841288596502e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point 
Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 7.653903e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.654105e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.654105e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.441824e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.442030e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.442030e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.889827 sec -INFO: No Floating Point Exceptions have been reported - 19,201,166,017 cycles # 2.786 GHz - 54,161,677,415 instructions # 2.82 insn per cycle - 6.893652512 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:32202) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.097119 sec + 19,021,241,015 cycles # 2.679 GHz + 53,824,218,201 instructions # 2.83 insn per cycle + 7.101056562 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:32012) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595861831675E-003 Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.552412e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.552503e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.552503e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.520581e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.520672e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.520672e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.403221 sec -INFO: No Floating Point Exceptions have been reported - 9,295,420,050 cycles # 2.729 GHz - 26,089,296,035 instructions # 2.81 insn per cycle - 3.407123949 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:95935) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.473548 sec + 9,360,233,363 cycles # 2.692 GHz + 25,827,022,283 instructions # 2.76 insn per cycle + 3.477681834 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:95883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594844308162E-003 Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.556434e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.556900e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.556900e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.499910e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.500338e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.500338e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.488620 sec -INFO: No Floating Point Exceptions have been reported - 4,059,104,235 cycles # 2.721 GHz - 9,213,839,753 instructions # 2.27 insn per cycle - 1.492560916 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:83864) (512y: 0) (512z: 0) +TOTAL : 1.510429 sec + 4,054,458,858 cycles # 2.678 GHz + 9,070,411,764 instructions # 2.24 insn per cycle + 1.514545882 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:83452) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.125241e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.125840e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.125840e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.057773e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.058358e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.058358e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.282211 sec -INFO: No Floating Point Exceptions have been reported - 3,511,408,538 cycles # 2.732 GHz - 8,168,208,932 instructions # 2.33 insn per cycle - 1.286095846 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:79421) (512y: 230) (512z: 0) +TOTAL : 1.302962 sec + 3,492,520,706 cycles # 2.673 GHz + 8,024,600,361 instructions # 2.30 insn per cycle + 1.307117868 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:79136) (512y: 215) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.517573e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.518129e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.518129e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.494027e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.494558e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.494558e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.503444 sec -INFO: No Floating Point Exceptions have been reported - 2,622,176,822 cycles # 1.740 GHz - 4,167,750,292 instructions # 1.59 insn per cycle - 1.507552292 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1879) (512y: 174) (512z:78884) +TOTAL : 1.513587 sec + 2,591,602,459 cycles # 1.708 GHz + 4,056,631,617 instructions # 1.57 insn per cycle + 1.517867253 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1776) (512y: 165) (512z:78888) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.scaling new file mode 100644 index 0000000000..f1df17a77c --- /dev/null +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +DATE: 2025-10-11_15:44:03 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +1.428635e+06 1 256 +2.986921e+06 2 256 +5.564976e+06 4 256 +1.150400e+07 8 256 +2.254241e+07 16 256 +3.299328e+07 32 256 +3.991678e+07 64 256 +4.342243e+07 128 256 +4.801742e+07 256 256 +5.029240e+07 512 256 +5.134165e+07 1024 256 +### GPU: scaling test 32 +1.949995e+05 1 32 +3.776925e+05 2 32 +7.282783e+05 4 32 +1.483318e+06 8 32 +2.934652e+06 16 32 +4.620001e+06 32 32 +1.110479e+07 64 32 +2.248141e+07 128 32 +3.497298e+07 256 32 +3.843258e+07 512 32 +4.371853e+07 1024 32 +4.702509e+07 2048 32 +4.914143e+07 4096 32 +5.007560e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.018202e+05 1 256 +1.029861e+05 2 256 +1.049904e+05 4 256 +### CPU: scaling test 32 +9.750093e+04 1 32 +9.993083e+04 2 32 +1.029180e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.770505e+05 1 256 +1.765797e+05 2 256 +1.854054e+05 4 256 +### CPU: scaling test 32 +1.484850e+05 1 32 +1.713608e+05 2 32 +1.595040e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.857545e+05 1 256 +3.168191e+05 2 256 +3.177122e+05 4 256 +### CPU: scaling test 32 +2.953038e+05 1 32 +3.077116e+05 2 32 +2.876185e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.080307e+05 1 256 +3.180421e+05 2 256 +3.341884e+05 4 256 +### CPU: scaling test 32 +2.868052e+05 1 32 +3.156394e+05 2 32 +3.097819e+05 4 32 +========================================================================= +scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.313974e+05 1 256 +2.307900e+05 2 256 +2.293449e+05 4 256 +### CPU: scaling test 32 +2.313560e+05 1 32 +2.290500e+05 2 32 +2.289947e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 468f6865a8..d112a11495 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,223 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-10-06_09:05:51 +DATE: 2025-10-11_15:27:25 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.906944e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.902591e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.013821e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.313564e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.022320e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.232850e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.458221 sec -INFO: No Floating Point Exceptions have been reported - 1,930,997,109 cycles # 2.858 GHz - 2,724,198,211 instructions # 1.41 insn per cycle - 0.805328419 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.462516 sec + 1,997,687,796 cycles # 2.814 GHz + 2,748,418,377 instructions # 1.38 insn per cycle + 0.769002804 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.002453e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.463176e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.675243e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.849800e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.989232e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.162437e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.544639 sec -INFO: No Floating Point Exceptions have been reported - 2,250,691,324 cycles # 2.871 GHz - 3,190,813,390 instructions # 1.42 insn per cycle - 0.843484638 seconds time elapsed +TOTAL : 0.537675 sec + 2,303,047,279 cycles # 2.838 GHz + 3,173,611,128 instructions # 1.38 insn per cycle + 0.868680787 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.424749e-01 -Avg ME (F77/GPU) = 0.14247482467490466 -Relative difference = 5.286902838873106e-07 +Avg ME (F77/GPU) = 0.14247482467490463 +Relative difference = 5.286902840821208e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.052668e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.075406e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.075406e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.039909e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.062156e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.062156e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.578445 sec -INFO: No Floating Point Exceptions have been reported - 4,629,037,835 cycles # 2.928 GHz - 13,193,545,970 instructions # 2.85 insn per cycle - 1.584589009 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 707) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.595860 sec + 4,617,130,408 cycles # 2.888 GHz + 13,249,342,927 instructions # 2.87 insn per cycle + 1.599801948 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 691) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will 
cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499481 Relative difference = 5.286896511435107e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.869817e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.940106e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.940106e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.827783e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.896147e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.896147e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.895982 sec -INFO: No Floating Point Exceptions have been reported - 2,636,174,950 cycles # 2.931 GHz - 7,556,706,256 instructions # 2.87 insn per cycle - 0.901753059 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3099) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.915570 sec + 2,669,358,674 cycles # 2.905 GHz + 7,600,949,147 instructions # 2.85 insn per cycle + 0.919765484 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3082) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499475 Relative difference = 5.286896515331313e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.170738e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.377041e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.377041e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.046861e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.237725e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.237725e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.538337 sec -INFO: No Floating Point Exceptions have been reported - 1,492,365,440 cycles # 2.760 GHz - 3,161,633,609 instructions # 2.12 insn per cycle - 0.543901971 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2991) (512y: 0) (512z: 0) +TOTAL : 0.557374 sec + 1,530,133,486 cycles # 2.729 GHz + 3,193,359,124 instructions # 2.09 insn per cycle + 0.561538714 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3021) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492589 Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.502118e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.753079e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.753079e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.222833e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.436298e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.436298e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.488254 sec -INFO: No Floating Point Exceptions have been reported - 1,345,193,436 cycles # 2.734 GHz - 3,015,805,712 instructions # 2.24 insn per cycle - 0.494320620 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2749) (512y: 104) (512z: 0) +TOTAL : 0.527914 sec + 1,448,845,809 cycles # 2.727 GHz + 3,068,216,889 instructions # 2.12 insn per cycle + 0.532005288 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2827) (512y: 84) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492589 Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.340176e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.450488e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.450488e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.262309e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.366937e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.366937e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.722488 sec -INFO: No Floating Point Exceptions have been reported - 1,326,137,037 cycles # 1.826 GHz - 1,964,340,659 instructions # 1.48 insn per cycle - 0.728328312 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1379) (512y: 106) (512z: 2218) +TOTAL : 0.746275 sec + 1,345,907,467 cycles # 1.795 GHz + 1,981,512,387 instructions # 1.47 insn per cycle + 0.750498916 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1439) (512y: 84) (512z: 2209) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492589 Relative difference = 5.286901348574438e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt index a32e85fd77..542ec194e9 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,272 +10,231 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-10-06_09:39:10 +DATE: 2025-10-11_16:30:42 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.313371e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.590831e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.590831e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.356662e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.903029e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.903029e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.487212 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,046,207,140 cycles # 2.880 GHz - 3,015,907,255 instructions # 1.47 insn per cycle - 0.769534809 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +TOTAL : 0.490080 sec + 2,074,202,921 cycles # 2.819 GHz + 2,982,362,559 instructions # 1.44 insn per cycle + 0.792779275 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.228660e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.270938e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.270938e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.203461e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.181328e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.181328e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.758730 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,917,079,859 cycles # 2.883 GHz - 4,489,082,127 instructions # 1.54 insn per cycle - 1.069078440 seconds time elapsed +TOTAL : 0.757533 sec + 2,979,284,817 cycles # 2.853 GHz + 4,399,436,734 instructions # 1.48 insn per cycle + 1.101470538 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.424749e-01 -Avg ME (F77/GPU) = 0.14247482467490466 -Relative difference = 5.286902838873106e-07 +Avg ME (F77/GPU) = 0.14247482467490463 +Relative difference = 5.286902840821208e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE 
program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.058535e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.081557e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.081557e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.040166e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.062990e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.062990e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.574537 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 4,656,483,821 cycles # 2.950 GHz - 13,198,201,576 instructions # 2.83 insn per cycle - 1.579077435 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 707) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.601584 sec + 4,649,519,147 cycles # 2.897 GHz + 13,253,744,210 instructions # 2.85 insn per cycle + 1.606011259 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 691) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499481 Relative difference = 5.286896511435107e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.861172e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.931943e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.931943e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.815648e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.884893e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.884893e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.907508 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,678,662,656 cycles # 2.939 GHz - 7,605,263,564 instructions # 2.84 insn per cycle - 0.912202227 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3099) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.929220 sec + 2,705,069,112 cycles # 2.900 GHz + 7,649,258,945 instructions # 2.83 insn per cycle + 0.933656370 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3082) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499475 Relative difference = 5.286896515331313e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.153263e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.357026e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.357026e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.970773e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.160922e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.160922e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.547067 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 1,524,781,245 cycles # 2.767 GHz - 3,210,388,287 instructions # 2.11 insn per cycle - 0.551691801 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2991) (512y: 0) (512z: 0) +TOTAL : 0.579438 sec + 1,570,726,943 cycles # 2.694 GHz + 3,243,232,441 instructions # 2.06 insn per cycle + 0.583677287 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3021) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492589 Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.508777e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.767060e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.767060e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.172484e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.386570e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.386570e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.494747 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 1,383,177,469 cycles # 2.773 GHz - 3,064,481,068 instructions # 2.22 insn per cycle - 0.499446571 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2749) (512y: 104) (512z: 0) +TOTAL : 0.544496 sec + 1,490,247,847 cycles # 2.718 GHz + 3,118,276,131 instructions # 2.09 insn per cycle + 0.548976134 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2827) (512y: 84) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492589 Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.351157e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.462501e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.462501e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.208001e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.313270e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.313270e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.725065 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 1,357,891,290 cycles # 1.863 GHz - 2,000,455,329 instructions # 1.47 insn per cycle - 0.729577819 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1379) (512y: 106) (512z: 2218) +TOTAL : 0.771513 sec + 1,385,006,024 cycles # 1.787 GHz + 2,018,418,785 instructions # 1.46 insn per cycle + 0.775891856 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1439) (512y: 84) (512z: 2209) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492589 Relative difference = 5.286901348574438e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt index 67eac99bab..c96c0f2bba 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,223 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-10-06_09:06:04 +DATE: 2025-10-11_15:27:47 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.866343e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.840904e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.947003e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.222648e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.903995e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.118782e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.463809 sec -INFO: No Floating Point Exceptions have been reported - 1,942,418,108 cycles # 2.861 GHz - 2,721,411,859 instructions # 1.40 insn per cycle - 0.812650633 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.464819 sec + 2,030,821,916 cycles # 2.839 GHz + 2,744,793,219 instructions # 1.35 insn per cycle + 0.772863650 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.997280e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.399599e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.603946e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.790256e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.896792e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.070548e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.538885 sec -INFO: No Floating Point Exceptions have been reported - 2,239,160,610 cycles # 2.873 GHz - 3,203,384,758 instructions # 1.43 insn per cycle - 0.836856412 seconds time elapsed +TOTAL : 0.539655 sec + 2,316,213,602 cycles # 2.850 GHz + 3,194,995,847 instructions # 1.38 insn per cycle + 0.870686173 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.424749e-01 -Avg ME (F77/GPU) = 0.14247482467490466 -Relative difference = 5.286902838873106e-07 +Avg ME (F77/GPU) = 0.14247482467490463 +Relative difference = 5.286902840821208e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.060643e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.083213e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.083213e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.036091e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.058176e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.058176e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.565121 sec -INFO: No Floating Point Exceptions have been reported - 4,623,795,988 cycles # 2.948 GHz - 13,181,888,102 instructions # 2.85 insn per cycle - 1.571833324 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 692) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.601117 sec + 4,614,781,714 cycles # 2.877 GHz + 13,227,683,016 instructions # 2.87 insn per cycle + 1.605070443 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 679) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will 
cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499481 Relative difference = 5.286896511435107e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.878003e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.949625e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.949625e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.832083e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.900484e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.900484e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.892385 sec -INFO: No Floating Point Exceptions have been reported - 2,641,116,720 cycles # 2.947 GHz - 7,555,506,374 instructions # 2.86 insn per cycle - 0.899472366 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3093) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.913405 sec + 2,666,905,925 cycles # 2.909 GHz + 7,595,681,340 instructions # 2.85 insn per cycle + 0.917462386 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3077) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499475 Relative difference = 5.286896515331313e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.178148e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.383095e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.383095e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.997059e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.186796e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.186796e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.535311 sec -INFO: No Floating Point Exceptions have been reported - 1,491,222,481 cycles # 2.767 GHz - 3,161,019,864 instructions # 2.12 insn per cycle - 0.541387025 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2976) (512y: 0) (512z: 0) +TOTAL : 0.566232 sec + 1,532,545,982 cycles # 2.690 GHz + 3,190,811,369 instructions # 2.08 insn per cycle + 0.570104783 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3005) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492589 Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.523592e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.778898e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.778898e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.138120e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.345703e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.345703e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.485060 sec -INFO: No Floating Point Exceptions have been reported - 1,349,314,232 cycles # 2.763 GHz - 3,012,812,614 instructions # 2.23 insn per cycle - 0.489068736 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2726) (512y: 104) (512z: 0) +TOTAL : 0.542027 sec + 1,447,882,232 cycles # 2.655 GHz + 3,062,649,899 instructions # 2.12 insn per cycle + 0.545967207 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2804) (512y: 84) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492589 Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.347943e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.459729e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.459729e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.226133e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.328099e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.328099e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.720112 sec -INFO: No Floating Point Exceptions have been reported - 1,326,103,986 cycles # 1.833 GHz - 1,962,664,460 instructions # 1.48 insn per cycle - 0.726078775 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1356) (512y: 106) (512z: 2218) +TOTAL : 0.757778 sec + 1,343,211,600 cycles # 1.765 GHz + 1,978,672,810 instructions # 1.47 insn per cycle + 0.761787399 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1416) (512y: 84) (512z: 2209) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492589 Relative difference = 5.286901348574438e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.scaling new file mode 100644 index 0000000000..8a82307bae --- /dev/null +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +DATE: 2025-10-11_15:44:45 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +1.527045e+06 1 256 +3.131556e+06 2 256 +6.093388e+06 4 256 +1.251780e+07 8 256 +2.244630e+07 16 256 +4.178995e+07 32 256 +6.592442e+07 64 256 +7.658956e+07 128 256 +8.216021e+07 256 256 +8.838611e+07 512 256 +9.244041e+07 1024 256 +### GPU: scaling test 32 +1.864346e+05 1 32 +3.981461e+05 2 32 +7.916041e+05 4 32 +1.446352e+06 8 32 +2.861310e+06 16 32 +6.255536e+06 32 32 +1.192410e+07 64 32 +2.215132e+07 128 32 +4.236701e+07 256 32 +6.877647e+07 512 32 +7.973525e+07 1024 32 +8.551740e+07 2048 32 +9.532558e+07 4096 32 +9.914765e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.054964e+05 1 256 +1.086764e+05 2 256 +1.085879e+05 4 256 +### CPU: scaling test 32 +9.631447e+04 1 32 +1.042281e+05 2 32 +1.016890e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.679848e+05 1 256 +2.830096e+05 2 256 +2.920388e+05 4 256 +### CPU: scaling test 32 +2.003030e+05 1 32 +2.733186e+05 2 32 +2.733314e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.015207e+05 1 256 +5.639568e+05 2 256 +5.644473e+05 4 256 +### CPU: scaling test 32 +5.530113e+05 1 32 +5.540310e+05 2 32 +6.104453e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.318601e+05 1 256 +5.672087e+05 2 256 +5.418454e+05 4 256 +### CPU: scaling test 32 +4.569666e+05 1 32 +5.422212e+05 2 32 +5.271481e+05 4 32 +========================================================================= +scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.266468e+05 1 256 +4.319869e+05 2 256 +4.643166e+05 4 256 +### CPU: scaling test 32 +4.562174e+05 1 32 +4.628927e+05 2 32 +4.441638e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index fa95ebd131..3c2f832038 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,223 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-10-06_09:06:45 +DATE: 2025-10-11_15:28:49 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.818001e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.982501e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.122889e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.775185e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.659813e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.119856e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.452282 sec -INFO: No Floating Point Exceptions have been reported - 1,920,727,034 cycles # 2.860 GHz - 2,694,517,558 instructions # 1.40 insn per cycle - 0.728408510 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 169 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.460990 sec + 2,032,870,493 cycles # 2.841 GHz + 2,757,410,394 instructions # 1.36 insn per cycle + 0.774218584 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 161 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 31 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.287877e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.320334e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.683236e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.197057e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.828077e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.174418e+07 ) sec^-1 MeanMatrixElemValue = ( 2.571360e+02 +- 2.114020e+02 ) GeV^-2 -TOTAL : 0.495314 sec -INFO: No Floating Point Exceptions have been reported - 2,079,539,950 cycles # 2.850 GHz - 2,952,237,418 instructions # 1.42 insn per cycle - 0.786339466 seconds time elapsed +TOTAL : 0.492525 sec + 2,151,242,968 cycles # 2.846 GHz + 2,972,332,872 instructions # 1.38 insn per cycle + 0.812892837 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.424226e-01 -Avg ME (F77/GPU) = 0.14247487904286338 -Relative difference = 0.0003670698531228044 +Avg ME (F77/GPU) = 0.14247487171431850 +Relative difference = 0.0003670183967887531 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.109567e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.134660e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.134660e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.088774e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.113486e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.113486e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.494759 sec -INFO: No Floating Point Exceptions have been reported - 4,403,081,916 cycles # 2.940 GHz - 12,951,948,710 instructions # 2.94 insn per cycle - 1.498420981 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 645) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.523041 sec + 4,438,181,728 cycles # 2.908 GHz + 12,997,899,281 instructions # 2.93 insn per cycle + 1.526979824 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 651) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will 
cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 Avg ME (F77/C++) = 0.14246861273719524 Relative difference = 8.940352641194861e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.886806e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.066754e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.066754e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.813324e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.986491e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.986491e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 0.584675 sec -INFO: No Floating Point Exceptions have been reported - 1,726,276,919 cycles # 2.937 GHz - 4,542,407,737 instructions # 2.63 insn per cycle - 0.588476135 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3627) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.599748 sec + 1,741,244,369 cycles # 2.889 GHz + 4,565,155,972 instructions # 2.62 insn per cycle + 0.603721432 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3608) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 Avg ME (F77/C++) = 0.14246862329122401 Relative difference = 1.6348320966878032e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.651382e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.346145e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.346145e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.470584e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.128186e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.128186e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.307816 sec -INFO: No Floating Point Exceptions have been reported - 856,647,676 cycles # 2.754 GHz - 1,917,830,464 instructions # 2.24 insn per cycle - 0.311794908 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3580) (512y: 0) (512z: 0) +TOTAL : 0.317328 sec + 874,197,910 cycles # 2.725 GHz + 1,937,671,895 instructions # 2.22 insn per cycle + 0.321309948 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3608) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247491543012991 Relative difference = 1.0830068962165901e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.083995e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.890169e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.890169e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.732936e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.453145e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.453145e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.287118 sec -INFO: No Floating Point Exceptions have been reported - 801,284,784 cycles # 2.760 GHz - 1,834,043,941 instructions # 2.29 insn per cycle - 0.290894624 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3400) (512y: 22) (512z: 0) +TOTAL : 0.303630 sec + 837,570,844 cycles # 2.728 GHz + 1,865,428,267 instructions # 2.23 insn per cycle + 0.307759201 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3485) (512y: 2) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247491543012991 Relative difference = 1.0830068962165901e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.500723e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.948038e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.948038e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.363450e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.779212e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.779212e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.384030 sec -INFO: No Floating Point Exceptions have been reported - 726,928,592 cycles # 1.877 GHz - 1,308,660,654 instructions # 1.80 insn per cycle - 0.387900268 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1964) (512y: 24) (512z: 2435) +TOTAL : 0.396164 sec + 743,365,153 cycles # 1.861 GHz + 1,320,595,546 instructions # 1.78 insn per cycle + 0.400174159 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2032) (512y: 2) (512z: 2428) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247491576758442 Relative difference = 1.1066920862943416e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt index 5a6a874489..3158a41f16 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,272 +10,231 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-10-06_09:39:24 +DATE: 2025-10-11_16:31:01 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.958276e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.362856e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.362856e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.164266e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.164377e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.164377e+07 ) sec^-1 MeanMatrixElemValue = ( 2.017654e+01 +- 1.429183e+01 ) GeV^-2 -TOTAL : 0.467586 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 1,958,269,299 cycles # 2.868 GHz - 2,873,921,299 instructions # 1.47 insn per cycle - 0.741370031 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +TOTAL : 0.466915 sec + 2,002,533,494 cycles # 2.818 GHz + 2,846,516,929 instructions # 1.42 insn per cycle + 0.767921314 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 161 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 169 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 31 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.867040e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.953002e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.953002e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.935448e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.962699e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.962699e+07 ) sec^-1 MeanMatrixElemValue = ( 2.609941e+02 +- 2.115589e+02 ) GeV^-2 -TOTAL : 0.638465 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,513,600,406 cycles # 2.877 GHz - 3,810,036,638 instructions # 1.52 insn per cycle - 0.930171723 seconds time elapsed +TOTAL : 0.638881 sec + 2,551,134,973 cycles # 2.829 GHz + 3,814,025,702 instructions # 1.50 insn per cycle + 0.960291968 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.424226e-01 -Avg ME (F77/GPU) = 0.14247487904286338 -Relative difference = 0.0003670698531228044 +Avg ME (F77/GPU) = 0.14247487171431850 +Relative difference = 0.0003670183967887531 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE 
program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.115307e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.140507e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.140507e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.072670e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.097133e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.097133e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.490082 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 4,418,597,373 cycles # 2.958 GHz - 12,956,387,401 instructions # 2.93 insn per cycle - 1.494530314 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 645) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.549724 sec + 4,455,261,943 cycles # 2.869 GHz + 13,001,491,970 instructions # 2.92 insn per cycle + 1.553804785 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 651) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 Avg ME (F77/C++) = 0.14246861273719524 Relative difference = 8.940352641194861e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.871197e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.051268e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.051268e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.775020e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.950077e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.950077e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 0.592243 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 1,749,393,716 cycles # 2.936 GHz - 4,590,457,409 instructions # 2.62 insn per cycle - 0.596762261 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3627) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.612678 sec + 1,763,964,947 cycles # 2.863 GHz + 4,612,364,671 instructions # 2.61 insn per cycle + 0.616741606 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3608) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 Avg ME (F77/C++) = 0.14246862329122401 Relative difference = 1.6348320966878032e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.650062e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.340176e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.340176e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.406265e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.059656e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.059656e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.311783 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 875,769,738 cycles # 2.776 GHz - 1,954,803,706 instructions # 2.23 insn per cycle - 0.316080972 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3580) (512y: 0) (512z: 0) +TOTAL : 0.325484 sec + 894,227,621 cycles # 2.718 GHz + 1,973,650,274 instructions # 2.21 insn per cycle + 0.329612707 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3608) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247491543012991 Relative difference = 1.0830068962165901e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.042794e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.845843e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.845843e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.495052e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.198837e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.198837e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.293361 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 825,335,769 cycles # 2.779 GHz - 1,870,845,111 instructions # 2.27 insn per cycle - 0.297556229 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3400) (512y: 22) (512z: 0) +TOTAL : 0.321201 sec + 866,167,930 cycles # 2.668 GHz + 1,901,550,421 instructions # 2.20 insn per cycle + 0.325340653 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3485) (512y: 2) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247491543012991 Relative difference = 1.0830068962165901e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.484934e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.935540e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.935540e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.189669e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.585230e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.585230e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.390040 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 749,752,693 cycles # 1.904 GHz - 1,350,296,093 instructions # 1.80 insn per cycle - 0.394449871 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1964) (512y: 24) (512z: 2435) +TOTAL : 0.417280 sec + 768,093,760 cycles # 1.825 GHz + 1,361,032,349 instructions # 1.77 insn per cycle + 0.423250195 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2032) (512y: 2) (512z: 2428) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247491576758442 Relative difference = 1.1066920862943416e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt index cea07bf7e8..8874a06c98 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,223 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-10-06_09:06:58 +DATE: 2025-10-11_15:29:09 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.801672e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.945717e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.092440e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.726166e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.668422e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.110300e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.453252 sec -INFO: No Floating Point Exceptions have been reported - 1,914,636,683 cycles # 2.859 GHz - 2,699,162,883 instructions # 1.41 insn per cycle - 0.727606605 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 169 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.456732 sec + 1,986,727,615 cycles # 2.822 GHz + 2,734,105,162 instructions # 1.38 insn per cycle + 0.761604044 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 163 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 31 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.322683e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.438723e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.801307e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.139451e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.748092e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.065888e+07 ) sec^-1 MeanMatrixElemValue = ( 2.571360e+02 +- 2.114020e+02 ) GeV^-2 -TOTAL : 0.493317 sec -INFO: No Floating Point Exceptions have been reported - 2,100,361,107 cycles # 2.862 GHz - 2,955,351,040 instructions # 1.41 insn per cycle - 0.791031778 seconds time elapsed +TOTAL : 0.491750 sec + 2,144,083,987 cycles # 2.843 GHz + 2,965,934,309 instructions # 1.38 insn per cycle + 0.811495819 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.424226e-01 -Avg ME (F77/GPU) = 0.14247487904286338 -Relative difference = 0.0003670698531228044 +Avg ME (F77/GPU) = 0.14247487171431850 +Relative difference = 0.0003670183967887531 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.112466e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.138003e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.138003e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.088510e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.113295e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.113295e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.490381 sec -INFO: No Floating Point Exceptions have been reported - 4,405,341,411 cycles # 2.950 GHz - 12,928,117,316 instructions # 2.93 insn per cycle - 1.494164072 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 630) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.523573 sec + 4,436,604,782 cycles # 2.906 GHz + 12,976,159,794 instructions # 2.92 insn per cycle + 1.527521775 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 635) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will 
cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 Avg ME (F77/C++) = 0.14246861273719524 Relative difference = 8.940352641194861e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.897278e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.076728e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.076728e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.835028e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.015163e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.015163e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 0.582482 sec -INFO: No Floating Point Exceptions have been reported - 1,724,294,786 cycles # 2.945 GHz - 4,536,655,836 instructions # 2.63 insn per cycle - 0.586223274 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3611) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.596717 sec + 1,741,466,538 cycles # 2.902 GHz + 4,559,733,587 instructions # 2.62 insn per cycle + 0.600733453 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3592) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 Avg ME (F77/C++) = 0.14246862329122401 Relative difference = 1.6348320966878032e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.690817e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.397497e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.397497e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.380055e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.028758e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.028758e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.305315 sec -INFO: No Floating Point Exceptions have been reported - 857,155,838 cycles # 2.779 GHz - 1,914,615,212 instructions # 2.23 insn per cycle - 0.309003061 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3549) (512y: 0) (512z: 0) +TOTAL : 0.322659 sec + 877,270,879 cycles # 2.691 GHz + 1,934,809,792 instructions # 2.21 insn per cycle + 0.326541378 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3579) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247491543012991 Relative difference = 1.0830068962165901e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.056800e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.870570e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.870570e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.601915e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.305503e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.305503e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.288177 sec -INFO: No Floating Point Exceptions have been reported - 804,254,194 cycles # 2.761 GHz - 1,829,977,116 instructions # 2.28 insn per cycle - 0.291930002 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3364) (512y: 22) (512z: 0) +TOTAL : 0.310801 sec + 841,602,182 cycles # 2.678 GHz + 1,861,524,675 instructions # 2.21 insn per cycle + 0.314890210 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3449) (512y: 2) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247491543012991 Relative difference = 1.0830068962165901e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.550897e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.994144e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.994144e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.229370e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.636992e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.636992e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.380837 sec -INFO: No Floating Point Exceptions have been reported - 727,485,601 cycles # 1.894 GHz - 1,306,171,995 instructions # 1.80 insn per cycle - 0.384559776 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1928) (512y: 24) (512z: 2435) +TOTAL : 0.407631 sec + 742,675,842 cycles # 1.807 GHz + 1,318,218,015 instructions # 1.77 insn per cycle + 0.411673396 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1996) (512y: 2) (512z: 2428) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247491576758442 Relative difference = 1.1066920862943416e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.scaling new file mode 100644 index 0000000000..86c9b7a546 --- /dev/null +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +DATE: 2025-10-11_15:44:24 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +1.435943e+06 1 256 +3.007907e+06 2 256 +5.634857e+06 4 256 +1.139868e+07 8 256 +2.191875e+07 16 256 +3.261770e+07 32 256 +3.913775e+07 64 256 +4.321439e+07 128 256 +4.782407e+07 256 256 +5.013042e+07 512 256 +5.117203e+07 1024 256 +### GPU: scaling test 32 +1.833223e+05 1 32 +3.625426e+05 2 32 +7.314829e+05 4 32 +1.459646e+06 8 32 +2.859760e+06 16 32 +5.667384e+06 32 32 +1.106459e+07 64 32 +2.218503e+07 128 32 +3.531887e+07 256 32 +3.896073e+07 512 32 +4.341558e+07 1024 32 +4.714542e+07 2048 32 +4.934308e+07 4096 32 +4.999316e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.008880e+05 1 256 +1.037575e+05 2 256 +1.026899e+05 4 256 +### CPU: scaling test 32 +8.543860e+04 1 32 +9.559401e+04 2 32 +9.690869e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.755069e+05 1 256 +1.824668e+05 2 256 +1.862361e+05 4 256 +### CPU: scaling test 32 +1.737091e+05 1 32 +1.676543e+05 2 32 +1.681730e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.270964e+05 1 256 +3.057259e+05 2 256 +3.141285e+05 4 256 +### CPU: scaling test 32 +2.994544e+05 1 32 +3.090295e+05 2 32 +3.346475e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.254054e+05 1 256 +3.252183e+05 2 256 +3.259569e+05 4 256 +### CPU: scaling test 32 +3.498874e+05 1 32 +3.542076e+05 2 32 +3.198481e+05 4 32 +========================================================================= +scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.243613e+05 1 256 +2.351291e+05 2 256 +2.345114e+05 4 256 +### CPU: scaling test 32 +2.301860e+05 1 32 +2.329857e+05 2 32 +2.104986e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index cb0b82e9a4..d3f2e68af7 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,251 +10,226 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-10-06_09:06:18 +DATE: 2025-10-11_15:28:08 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.883484e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.876597e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.990293e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.235119e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.971049e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.180643e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.458412 sec -INFO: No Floating Point Exceptions have been reported - 1,935,066,146 cycles # 2.866 GHz - 2,699,989,812 instructions # 1.40 insn per cycle - 0.733387527 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.464283 sec + 2,023,320,904 cycles # 2.839 GHz + 2,773,493,223 instructions # 1.37 insn per cycle + 0.771475737 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 38 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.013974e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.497451e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.709351e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.827739e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.997089e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.176442e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.541801 sec -INFO: No Floating Point Exceptions have been reported - 2,287,504,645 cycles # 2.883 GHz - 3,220,826,671 instructions # 1.41 insn per cycle - 0.850636557 seconds time elapsed +TOTAL : 0.537726 sec + 2,282,885,717 cycles # 2.817 GHz + 3,160,756,797 instructions # 1.38 insn per cycle + 0.868903156 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.424749e-01 -Avg ME (F77/GPU) = 0.14247482577104625 -Relative difference = 5.209967070245855e-07 +Avg ME (F77/GPU) = 0.14247482419639743 +Relative difference = 5.320488209618161e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.050634e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.073472e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.073472e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.042873e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.065099e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.065099e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.580404 sec -INFO: No Floating Point Exceptions have been reported - 4,643,189,098 cycles # 2.932 GHz - 13,180,741,468 instructions # 2.84 insn per cycle - 1.584505840 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 681) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.591072 sec + 4,638,115,400 cycles # 2.909 GHz + 13,236,410,026 instructions # 2.85 insn per cycle + 1.595277597 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 691) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will 
cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482734618697 Relative difference = 5.099411406595165e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.871761e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.941517e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.941517e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.832450e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.902450e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.902450e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.895197 sec -INFO: No Floating Point Exceptions have been reported - 2,647,990,030 cycles # 2.947 GHz - 7,474,565,418 instructions # 2.82 insn per cycle - 0.899253220 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3152) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.913352 sec + 2,653,863,508 cycles # 2.895 GHz + 7,455,424,096 instructions # 2.81 insn per cycle + 0.917427770 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3062) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482734618697 -Relative difference = 5.099411406595165e-07 +Avg ME (F77/C++) = 0.14247482733329694 +Relative difference = 5.100316128927506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.201825e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.415489e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.415489e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.117188e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.318909e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.318909e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.532048 sec -INFO: No Floating Point Exceptions have been reported - 1,472,019,476 cycles # 2.748 GHz - 3,129,064,583 instructions # 2.13 insn per cycle - 0.536341858 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3133) (512y: 0) (512z: 0) +TOTAL : 0.545094 sec + 1,478,675,993 cycles # 2.696 GHz + 3,118,440,007 instructions # 2.11 insn per cycle + 0.549086981 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3060) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 +Avg ME (F77/C++) = 0.14247482641080925 +Relative difference = 5.165063512315125e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.569463e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.831852e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.831852e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.250725e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.471460e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.471460e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.479328 sec -INFO: No Floating Point Exceptions have been reported - 1,320,483,901 cycles # 2.736 GHz - 2,983,197,107 instructions # 2.26 insn per cycle - 0.483280271 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2895) (512y: 110) (512z: 0) +TOTAL : 0.523896 sec + 1,401,490,342 cycles # 2.658 GHz + 2,993,266,123 instructions # 2.14 insn per cycle + 0.527885129 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2873) (512y: 90) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 +Avg ME (F77/C++) = 0.14247482641080925 +Relative difference = 5.165063512315125e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.268192e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.372574e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.372574e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.231374e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.335386e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.335386e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.745303 sec -INFO: No Floating Point Exceptions have been reported - 1,365,795,021 cycles # 1.824 GHz - 1,991,870,632 instructions # 1.46 insn per cycle - 0.749335143 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1679) (512y: 108) (512z: 2251) +TOTAL : 0.756616 sec + 1,324,382,086 cycles # 1.743 GHz + 1,938,261,257 instructions # 1.46 insn per cycle + 0.760681799 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1363) (512y: 70) (512z: 2196) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 +Avg ME (F77/C++) = 0.14247482641080925 +Relative difference = 5.165063512315125e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt index 222758fe32..7ec5b5c818 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,251 +10,226 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-10-06_09:06:32 +DATE: 2025-10-11_15:28:30 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.879429e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.807541e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.902111e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.256105e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.967576e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.174354e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.462530 sec -INFO: No Floating Point Exceptions have been reported - 1,930,179,746 cycles # 2.847 GHz - 2,724,788,037 instructions # 1.41 insn per cycle - 0.736957830 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.463340 sec + 2,028,215,818 cycles # 2.846 GHz + 2,776,961,604 instructions # 1.37 insn per cycle + 0.769909609 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 38 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... 
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.958663e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.373563e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.576283e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.777604e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.905810e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.079424e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.543078 sec -INFO: No Floating Point Exceptions have been reported - 2,226,045,922 cycles # 2.831 GHz - 3,151,460,121 instructions # 1.42 insn per cycle - 0.843097781 seconds time elapsed +TOTAL : 0.537813 sec + 2,311,546,315 cycles # 2.847 GHz + 3,204,384,721 instructions # 1.39 insn per cycle + 0.869430768 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.424749e-01 -Avg ME (F77/GPU) = 0.14247482577104625 -Relative difference = 5.209967070245855e-07 +Avg ME (F77/GPU) = 0.14247482419639743 +Relative difference = 5.320488209618161e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] 
[inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.049471e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.072251e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.072251e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.027944e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.049964e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.049964e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.581795 sec -INFO: No Floating Point Exceptions have been reported - 4,647,850,638 cycles # 2.932 GHz - 13,168,659,581 instructions # 2.83 insn per cycle - 1.585735048 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 666) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.613580 sec + 4,641,772,345 cycles # 2.871 GHz + 13,214,748,096 instructions # 2.85 insn per cycle + 1.617579626 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 679) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482734618697 Relative difference = 5.099411406595165e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.863863e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.934907e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.934907e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.824575e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.893158e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.893158e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.898950 sec -INFO: No Floating Point Exceptions have been reported - 2,647,565,316 cycles # 2.935 GHz - 7,477,127,209 instructions # 2.82 insn per cycle - 0.902852166 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3141) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.916995 sec + 2,647,231,235 cycles # 2.877 GHz + 7,451,993,603 instructions # 2.82 insn per cycle + 0.920907127 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3057) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482734618697 -Relative difference = 5.099411406595165e-07 +Avg ME (F77/C++) = 0.14247482733329694 +Relative difference = 5.100316128927506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.193877e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.403471e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.403471e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.116778e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.320418e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.320418e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.533092 sec -INFO: No Floating Point Exceptions have been reported - 1,474,101,191 cycles # 2.747 GHz - 3,129,731,788 instructions # 2.12 insn per cycle - 0.537323582 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3111) (512y: 0) (512z: 0) +TOTAL : 0.545336 sec + 1,472,587,180 cycles # 2.683 GHz + 3,116,400,718 instructions # 2.12 insn per cycle + 0.549340783 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3043) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 +Avg ME (F77/C++) = 0.14247482641080925 +Relative difference = 5.165063512315125e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.595782e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.860984e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.860984e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.223699e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.443094e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.443094e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.475643 sec -INFO: No Floating Point Exceptions have been reported - 1,319,166,719 cycles # 2.754 GHz - 2,983,572,989 instructions # 2.26 insn per cycle - 0.479589426 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2871) (512y: 110) (512z: 0) +TOTAL : 0.528265 sec + 1,399,996,992 cycles # 2.634 GHz + 2,990,999,773 instructions # 2.14 insn per cycle + 0.532237029 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2854) (512y: 90) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 +Avg ME (F77/C++) = 0.14247482641080925 +Relative difference = 5.165063512315125e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.265955e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.372021e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.372021e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.302312e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.410857e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.410857e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.745301 sec -INFO: No Floating Point Exceptions have been reported - 1,365,993,831 cycles # 1.825 GHz - 1,991,757,917 instructions # 1.46 insn per cycle - 0.749395729 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 108) (512z: 2251) +TOTAL : 0.733431 sec + 1,324,620,583 cycles # 1.798 GHz + 1,936,852,170 instructions # 1.46 insn per cycle + 0.737506511 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1344) (512y: 70) (512z: 2196) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 +Avg ME (F77/C++) = 0.14247482641080925 +Relative difference = 5.165063512315125e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt index 9b3f75797b..14462fa0eb 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-10-06_10:01:13 +DATE: 2025-10-11_17:04:42 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.147069e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.778623e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.394888e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.654485e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.404459e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.690060e+07 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.535117 sec -INFO: No Floating Point Exceptions have been reported - 2,222,375,781 cycles # 2.890 GHz - 3,181,150,200 instructions # 1.43 insn per cycle - 0.828824866 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 228 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.541401 sec + 2,305,332,177 cycles # 2.847 GHz + 3,197,913,952 instructions # 1.39 insn per cycle + 0.868100814 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 204 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.313472e+00 Avg ME (F77/GPU) = 4.3134710926110280 Relative difference = 2.1036162329561614e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.628496e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.666122e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.666122e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.571130e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.606300e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.606300e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 6.554723 sec -INFO: No Floating Point Exceptions have been reported - 19,293,957,259 cycles # 2.941 GHz - 51,936,518,995 instructions # 2.69 insn per cycle - 6.561734499 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.786947 sec + 19,519,870,393 cycles # 2.875 GHz + 52,258,888,975 instructions # 2.68 insn per cycle + 6.792671431 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 655) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926105795 Relative difference = 2.1036172727915933e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.914767e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.044981e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.044981e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.857187e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.984563e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.984563e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.713846 sec -INFO: No Floating Point Exceptions have been reported - 10,942,394,234 cycles # 2.942 GHz - 30,809,451,561 instructions # 2.82 insn per cycle - 3.720459537 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2915) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.780938 sec + 10,994,068,173 cycles # 2.904 GHz + 30,917,710,259 instructions # 2.81 insn per cycle + 3.786765562 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2895) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926105795 Relative difference = 2.1036172727915933e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.701521e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.038587e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.038587e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.468427e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.776131e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.776131e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.349075 sec -INFO: No Floating Point Exceptions have been reported - 6,518,044,155 cycles # 2.767 GHz - 13,691,830,614 instructions # 2.10 insn per cycle - 2.356266703 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2941) (512y: 0) (512z: 0) +TOTAL : 2.458667 sec + 6,708,728,258 cycles # 2.723 GHz + 13,712,517,378 instructions # 2.04 insn per cycle + 2.464482201 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2936) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926107935 Relative difference = 2.103616776553298e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.169544e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.582169e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.582169e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.847459e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.209715e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.209715e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.151067 sec -INFO: No Floating Point Exceptions have been reported - 5,973,431,908 cycles # 2.768 GHz - 13,032,735,919 instructions # 2.18 insn per cycle - 2.158817844 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2667) (512y: 146) (512z: 0) +TOTAL : 2.275732 sec + 6,180,724,079 cycles # 2.710 GHz + 13,193,237,105 instructions # 2.13 insn per cycle + 2.281442783 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2714) (512y: 126) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926107935 Relative difference = 2.103616776553298e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.442417e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.620453e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.620453e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.203485e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.355713e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.355713e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.166223 sec -INFO: No Floating Point Exceptions have been reported - 5,879,580,303 cycles # 1.853 GHz - 8,614,888,302 instructions # 1.47 insn per cycle - 3.173636028 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1506) (512y: 128) (512z: 1946) +TOTAL : 3.384877 sec + 5,997,535,040 cycles # 1.769 GHz + 8,705,216,175 instructions # 1.45 insn per cycle + 3.390523516 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1546) (512y: 106) (512z: 1954) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926107935 Relative difference = 2.103616776553298e-07 diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt index fe94934cb0..c1b909362e 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-10-06_10:01:39 +DATE: 2025-10-11_17:05:16 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.155696e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.811430e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.416776e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.602305e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.299861e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.572992e+07 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.531667 sec -INFO: No Floating Point Exceptions have been reported - 2,222,115,079 cycles # 2.893 GHz - 3,196,008,298 instructions # 1.44 insn per cycle - 0.825144177 seconds time elapsed 
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 216 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.543522 sec + 2,289,271,142 cycles # 2.817 GHz + 3,205,208,831 instructions # 1.40 insn per cycle + 0.870293269 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.313472e+00 Avg ME (F77/GPU) = 4.3134710926110280 Relative difference = 2.1036162329561614e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.710634e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.751435e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.751435e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.653039e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.691951e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.691951e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 6.246482 sec -INFO: No Floating Point Exceptions have been reported - 18,390,828,933 cycles # 2.942 GHz - 50,070,723,541 instructions # 2.72 insn per cycle - 6.253313848 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 626) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.455303 sec + 18,685,885,377 cycles # 2.893 GHz + 50,237,697,539 instructions # 2.69 insn per cycle + 6.460495783 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 611) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926105795 Relative difference = 2.1036172727915933e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] 
[hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.069031e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.214398e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.214398e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.954178e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.091326e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.091326e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.535906 sec -INFO: No Floating Point Exceptions have been reported - 10,415,008,507 cycles # 2.940 GHz - 29,198,189,749 instructions # 2.80 insn per cycle - 3.543300262 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2733) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.661921 sec + 10,461,474,208 cycles # 2.853 GHz + 29,320,644,078 instructions # 2.80 insn per cycle + 3.667913174 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2712) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926105795 Relative difference = 2.1036172727915933e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) 
[cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.327920e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.613203e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.613203e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.223646e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.500682e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.500682e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.541514 sec -INFO: No Floating Point Exceptions have been reported - 7,032,477,509 cycles # 2.760 GHz - 15,175,173,386 instructions # 2.16 insn per cycle - 2.548867076 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3020) (512y: 0) (512z: 0) +TOTAL : 2.594203 sec + 6,988,437,642 cycles # 2.689 GHz + 15,195,785,073 instructions # 2.17 insn per cycle + 2.599980482 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3011) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926107935 Relative difference = 2.103616776553298e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.529226e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.840126e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.840126e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] 
(23) = ( 4.417064e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.714981e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.714981e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.433970 sec -INFO: No Floating Point Exceptions have been reported - 6,732,593,285 cycles # 2.759 GHz - 14,647,151,783 instructions # 2.18 insn per cycle - 2.441354685 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2621) (512y: 302) (512z: 0) +TOTAL : 2.485778 sec + 6,715,707,590 cycles # 2.696 GHz + 14,680,064,315 instructions # 2.19 insn per cycle + 2.491527768 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2612) (512y: 302) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926107935 Relative difference = 2.103616776553298e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.326729e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.490201e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.490201e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.163644e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.312325e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.312325e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) 
GeV^0 -TOTAL : 3.271504 sec -INFO: No Floating Point Exceptions have been reported - 6,070,928,941 cycles # 1.852 GHz - 10,360,391,243 instructions # 1.71 insn per cycle - 3.278977914 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1268) (512y: 214) (512z: 2129) +TOTAL : 3.425924 sec + 6,178,650,952 cycles # 1.801 GHz + 10,506,622,006 instructions # 1.70 insn per cycle + 3.431763355 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1317) (512y: 216) (512z: 2136) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926107935 Relative difference = 2.103616776553298e-07 diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt index 8cd2c74f38..32d858512c 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,242 +10,213 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-10-06_10:03:02 +DATE: 2025-10-11_17:06:56 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.465620e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.510965e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.608079e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.746430e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.525187e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.618301e+08 ) sec^-1 MeanMatrixElemValue = ( 7.154219e+00 +- 1.620281e-01 ) GeV^0 -TOTAL : 0.492110 sec -INFO: No Floating Point Exceptions have been reported - 2,084,727,455 cycles # 2.877 GHz - 2,955,736,176 instructions # 1.42 insn per cycle - 0.784112386 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 131 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.494982 sec + 2,135,489,785 cycles # 2.833 GHz + 2,986,554,714 instructions # 1.40 insn per cycle + 0.812364995 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 99 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 24 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.313490e+00 -Avg ME (F77/GPU) = 4.3136695491848513 -Relative difference = 4.162503792787837e-05 +Avg ME (F77/GPU) = 4.3136695760767907 +Relative difference = 4.1631272308702715e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.686557e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.727704e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.727704e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.639930e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.679722e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.679722e+05 ) sec^-1 MeanMatrixElemValue = ( 7.175644e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 6.305463 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 18,635,302,225 cycles # 2.953 GHz - 51,219,407,083 instructions # 2.75 insn per cycle - 6.310992251 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 625) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.483754 sec + 18,765,516,643 cycles # 2.893 GHz + 51,374,423,413 instructions # 2.74 insn per cycle + 6.489228485 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 623) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following 
Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313574e+00 Avg ME (F77/C++) = 4.3135738277342170 Relative difference = 3.9935743068669333e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.043062e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.307407e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.307407e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.904149e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.155838e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.155838e+05 ) sec^-1 MeanMatrixElemValue = ( 7.175642e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 2.681205 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 7,932,523,130 cycles # 2.953 GHz - 19,317,767,787 instructions # 2.44 insn per cycle - 2.686665617 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3542) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.775203 sec + 8,009,571,813 cycles # 2.881 GHz + 19,418,906,078 instructions # 2.42 insn per cycle + 2.780526828 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3524) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313572e+00 Avg ME (F77/C++) = 4.3135722697479650 Relative difference = 6.253470796314402e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.901471e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.926003e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.926003e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.670886e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.626596e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.626596e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.413719 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 3,953,020,388 cycles # 2.786 GHz - 8,832,668,299 instructions # 2.23 insn per cycle - 1.419629254 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3715) (512y: 0) (512z: 0) +TOTAL : 1.456000 sec + 3,972,178,441 cycles # 2.719 GHz + 8,869,239,722 instructions # 2.23 insn per cycle + 1.461741307 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3709) (512y: 0) (512z: 0) 
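For orientation, the EvtsPerSec figures above can be sanity-checked with simple arithmetic. The sketch below assumes that the "-p 2048 256 2" arguments are read as blocks x threads x iterations and that each EvtsPerSec value is the event count divided by the time spent in the corresponding phase; which work each timer actually covers is defined inside check_*.exe, so this is only a back-of-the-envelope illustration using the avx2 single-precision numbers just above:

```cpp
// Back-of-the-envelope check of a throughput figure from this log.
// Assumption: "-p 2048 256 2" means blocks x threads x iterations.
#include <cstdio>

int main()
{
  const long blocks = 2048, threads = 256, iterations = 2;
  const long nevents = blocks * threads * iterations; // 1048576 events in total
  const double evtsPerSecME = 8.626596e+05;           // EvtsPerSec[MatrixElems] above
  const double impliedMeSeconds = nevents / evtsPerSecME; // ~1.22 s
  std::printf( "events = %ld, implied ME-only time = %.3f s\n", nevents, impliedMeSeconds );
  return 0;
}
```

The implied ME-only time (~1.22 s) is consistently smaller than the reported TOTAL of 1.456 s for that run, since TOTAL also includes phase-space sampling and other overheads.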
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313565e+00 -Avg ME (F77/C++) = 4.3135645242873579 -Relative difference = 1.1028294269894893e-07 +Avg ME (F77/C++) = 4.3135645270813257 +Relative difference = 1.096352260831459e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.392997e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.544307e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.544307e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.928240e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.948874e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.948874e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.337803 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 3,735,491,375 cycles # 2.782 GHz - 8,430,906,889 instructions # 2.26 insn per cycle - 1.343508069 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3541) (512y: 20) (512z: 0) +TOTAL : 1.411952 sec + 3,818,419,324 cycles # 2.695 GHz + 8,547,519,956 instructions # 2.24 insn per cycle + 1.417398798 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3594) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313565e+00 -Avg ME (F77/C++) = 4.3135645242873579 -Relative difference = 1.1028294269894893e-07 +Avg ME (F77/C++) = 4.3135645270813257 +Relative difference = 1.096352260831459e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.024352e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.578236e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.578236e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.574912e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.065441e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.065441e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.827995 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 3,508,723,607 cycles # 1.915 GHz - 6,244,798,669 instructions # 1.78 insn per cycle - 1.833521857 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2325) (512y: 22) (512z: 2290) +TOTAL : 1.971243 sec + 3,626,432,325 cycles # 1.835 GHz + 6,319,513,510 instructions # 1.74 insn per cycle + 1.976911767 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2377) (512y: 0) (512z: 2299) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313564e+00 -Avg ME (F77/C++) = 4.3135643536224961 -Relative difference = 8.197919301304478e-08 +Avg ME (F77/C++) = 4.3135642320849001 +Relative difference = 5.380351369373482e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt index 1ff1d26090..218c8378c2 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,246 +10,213 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-10-06_10:03:24 +DATE: 2025-10-11_17:07:25 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.690902e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.615208e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.727767e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.779658e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.535884e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.628235e+08 ) sec^-1 MeanMatrixElemValue = ( 7.154219e+00 +- 1.620281e-01 ) GeV^0 -TOTAL : 0.493976 sec -INFO: No Floating Point Exceptions have been reported - 2,066,790,877 cycles # 2.843 GHz - 2,969,404,210 instructions # 1.44 insn per cycle - 0.785535997 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 125 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.493747 sec + 2,136,570,540 cycles # 2.832 GHz + 2,955,252,814 instructions # 1.38 insn per cycle + 0.811353108 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 100 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 24 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.313490e+00 -Avg ME (F77/GPU) = 4.3136695491848513 -Relative difference = 4.162503792787837e-05 +Avg ME (F77/GPU) = 4.3136695760767907 +Relative difference = 4.1631272308702715e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.736131e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.779781e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.779781e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.693969e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.736524e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.736524e+05 ) sec^-1 MeanMatrixElemValue = ( 7.175644e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 6.127979 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 18,032,140,147 cycles # 2.940 GHz - 49,602,643,371 instructions # 2.75 insn per cycle - 6.133935412 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 613) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.279316 sec + 18,165,491,134 cycles # 2.891 GHz + 49,676,906,698 instructions # 2.73 insn per cycle + 6.284692119 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 607) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/runTest_cpp.exe -INFO: The following 
Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313574e+00 Avg ME (F77/C++) = 4.3135738277342170 Relative difference = 3.9935743068669333e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.506367e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.839198e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.839198e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.443862e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.778187e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.778187e+05 ) sec^-1 MeanMatrixElemValue = ( 7.175642e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 2.414203 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 7,115,995,603 cycles # 2.942 GHz - 18,533,869,751 instructions # 2.60 insn per cycle - 2.419892180 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3252) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.449024 sec + 7,084,328,481 cycles # 2.887 GHz + 18,582,770,693 instructions # 2.62 insn per cycle + 2.454447463 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3222) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313572e+00 Avg ME (F77/C++) = 4.3135722697479650 Relative difference = 6.253470796314402e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.374488e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.825683e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.825683e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.216367e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.641236e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.641236e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 2.037733 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 5,644,200,229 cycles # 2.763 GHz - 10,848,148,808 instructions # 1.92 insn per cycle - 2.043741542 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4274) (512y: 0) (512z: 0) +TOTAL : 2.098866 sec + 5,652,855,011 cycles # 2.688 GHz + 10,909,770,006 instructions # 1.93 insn per cycle + 2.104181652 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4283) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313565e+00 -Avg ME (F77/C++) = 4.3135645242873579 -Relative difference = 1.1028294269894893e-07 +Avg ME (F77/C++) = 4.3135645270813257 +Relative difference = 1.096352260831459e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.433283e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.894901e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.894901e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.314509e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.753400e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.753400e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 2.017462 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 5,594,464,289 cycles # 2.767 GHz - 10,554,918,385 instructions # 1.89 insn per cycle - 2.022782231 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4138) (512y: 12) (512z: 0) +TOTAL : 2.062043 sec + 5,590,274,103 cycles # 2.706 GHz + 10,617,976,090 instructions # 1.90 
insn per cycle + 2.067292425 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4142) (512y: 13) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313565e+00 -Avg ME (F77/C++) = 4.3135645242873579 -Relative difference = 1.1028294269894893e-07 +Avg ME (F77/C++) = 4.3135645270813257 +Relative difference = 1.096352260831459e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.364066e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.648223e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.648223e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.151626e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.412256e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.412256e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 2.491143 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 4,639,687,839 cycles # 1.859 GHz - 8,661,216,579 instructions # 1.87 insn per cycle - 2.496647539 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) 
(avx2: 2799) (512y: 0) (512z: 2885) +TOTAL : 2.614832 sec + 4,741,117,769 cycles # 1.810 GHz + 8,743,372,129 instructions # 1.84 insn per cycle + 2.620465706 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2850) (512y: 0) (512z: 2889) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313564e+00 -Avg ME (F77/C++) = 4.3135643536224961 -Relative difference = 8.197919301304478e-08 +Avg ME (F77/C++) = 4.3135642320849001 +Relative difference = 5.380351369373482e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt index 12c9da87af..f4ff8c446a 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,246 +10,213 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-10-06_10:02:06 +DATE: 2025-10-11_17:05:47 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.131914e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.755854e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.359452e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.626534e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.403274e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.688448e+07 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.535565 sec -INFO: No Floating Point Exceptions have been reported - 2,204,224,001 cycles # 2.864 GHz - 3,121,247,303 instructions # 1.42 insn per cycle - 0.828499405 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 228 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.543452 sec + 2,301,166,740 cycles # 2.836 GHz + 3,210,334,164 instructions # 1.40 insn per cycle + 0.870784678 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 204 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 31 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.313472e+00 -Avg ME (F77/GPU) = 4.3134711012809239 -Relative difference = 2.0835166567625394e-07 +Avg ME (F77/GPU) = 4.3134712619343958 +Relative difference = 1.711070812999077e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.529079e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.561968e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.561968e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.489645e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.521138e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.521138e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 6.973239 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 20,550,139,482 cycles # 2.945 GHz - 51,941,635,065 instructions # 2.53 insn per cycle - 6.980082779 seconds time elapsed +TOTAL : 7.151635 sec + 20,539,261,330 cycles # 2.870 GHz + 52,312,072,955 instructions # 2.55 insn per cycle + 7.157317940 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 655) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, 
FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134711778082178 -Relative difference = 1.906102050071626e-07 +Avg ME (F77/C++) = 4.3134711782756741 +Relative difference = 1.9050183377028104e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.672019e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.782339e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.782339e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.635024e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.743558e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.743558e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 4.043433 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 11,521,778,322 cycles # 2.845 GHz - 30,615,090,868 instructions # 2.66 insn per cycle - 4.050715703 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2972) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.091108 sec + 11,568,480,565 cycles # 2.825 GHz + 30,592,470,506 instructions # 2.64 insn per cycle + 4.096724147 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2918) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134711778082178 -Relative difference = 1.906102050071626e-07 +Avg ME (F77/C++) = 4.3134711778081822 +Relative difference = 1.9061021324348284e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.474164e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.781347e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.781347e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.442158e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.748594e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.748594e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.469295 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 6,758,530,167 cycles # 2.729 GHz - 13,653,357,404 instructions # 2.02 insn per cycle - 2.477625143 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3118) (512y: 0) (512z: 0) +TOTAL : 2.473093 sec + 6,663,246,815 cycles # 2.689 GHz + 13,582,195,938 instructions # 2.04 insn per cycle + 2.478977008 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 
0) (avx2: 3085) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134712319139954 -Relative difference = 1.7806676491157786e-07 +Avg ME (F77/C++) = 4.3134712322699498 +Relative difference = 1.7798424336580573e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.946193e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.312777e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.312777e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.658370e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.993226e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.993226e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.239110 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 6,206,585,291 cycles # 2.765 GHz - 13,005,835,459 instructions # 2.10 insn per cycle - 2.246664710 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2851) (512y: 150) (512z: 0) +TOTAL : 2.362618 sec + 6,353,039,315 cycles # 2.684 GHz + 
13,072,016,547 instructions # 2.06 insn per cycle + 2.368607155 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2867) (512y: 130) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134712319139954 -Relative difference = 1.7806676491157786e-07 +Avg ME (F77/C++) = 4.3134712322699498 +Relative difference = 1.7798424336580573e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.130780e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.276017e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.276017e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.116355e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.262209e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.262209e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.470623 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 6,429,525,372 cycles # 1.849 GHz - 8,729,822,669 instructions # 1.36 insn per cycle - 3.478318009 seconds time elapsed -=Symbols 
in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1792) (512y: 130) (512z: 2014) +TOTAL : 3.476875 sec + 6,216,987,973 cycles # 1.786 GHz + 8,426,779,606 instructions # 1.36 insn per cycle + 3.483074770 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1598) (512y: 96) (512z: 1978) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134712319139954 -Relative difference = 1.7806676491157786e-07 +Avg ME (F77/C++) = 4.3134712322699498 +Relative difference = 1.7798424336580573e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt index 90c964242c..f78a78f7e9 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,246 +10,213 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-10-06_10:02:34 +DATE: 2025-10-11_17:06:21 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.143359e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.817002e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.430401e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.581022e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.292223e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.567393e+07 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.533031 sec -INFO: No Floating Point Exceptions have been reported - 2,222,154,822 cycles # 2.885 GHz - 3,215,427,054 instructions # 1.45 insn per cycle - 0.826924367 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 216 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.541711 sec + 2,303,336,148 cycles # 2.840 GHz + 3,222,227,466 instructions # 1.40 insn per cycle + 0.868265701 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 31 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.313472e+00 -Avg ME (F77/GPU) = 4.3134711012809239 -Relative difference = 2.0835166567625394e-07 +Avg ME (F77/GPU) = 4.3134712619343958 +Relative difference = 1.711070812999077e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.616471e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.652773e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.652773e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.563907e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.598575e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.598575e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 6.603326 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 19,494,406,226 cycles # 2.950 GHz - 49,966,413,800 instructions # 2.56 insn per cycle - 6.609959024 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 599) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.817167 sec + 19,709,237,083 cycles # 2.890 GHz + 50,290,409,188 instructions # 2.55 insn per cycle + 6.822753554 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 611) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/runTest_cpp.exe -INFO: The following 
Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134711778082178 -Relative difference = 1.906102050071626e-07 +Avg ME (F77/C++) = 4.3134711782756741 +Relative difference = 1.9050183377028104e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.890177e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.018164e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.018164e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.841525e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.969254e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.969254e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.745798 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 11,068,643,232 cycles # 2.950 GHz - 29,164,471,893 instructions # 2.63 insn per cycle - 3.753005329 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2815) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.802477 sec + 11,003,460,648 cycles # 2.890 GHz + 29,103,019,269 instructions # 2.64 insn per cycle + 3.808301655 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2766) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134711778082178 -Relative difference = 1.906102050071626e-07 +Avg ME (F77/C++) = 4.3134711778081822 +Relative difference = 1.9061021324348284e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.744994e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.955254e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.955254e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.769392e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.987989e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.987989e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.917714 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 8,087,123,435 cycles # 2.766 GHz - 15,210,355,188 instructions # 1.88 insn per cycle - 2.924634632 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3203) (512y: 0) (512z: 0) +TOTAL : 2.893528 sec + 7,880,875,441 cycles # 2.719 GHz + 15,079,012,118 instructions # 1.91 insn per cycle + 2.899352011 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 
0) (avx2: 3163) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134712319139954 -Relative difference = 1.7806676491157786e-07 +Avg ME (F77/C++) = 4.3134712322699498 +Relative difference = 1.7798424336580573e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.909194e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.140218e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.140218e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.967773e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.208568e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.208568e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.798673 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 7,730,347,780 cycles # 2.756 GHz - 14,498,978,915 instructions # 1.88 insn per cycle - 2.805768338 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2775) (512y: 304) (512z: 0) +TOTAL : 2.753936 sec + 7,508,856,368 cycles # 2.722 GHz + 
14,417,603,283 instructions # 1.92 insn per cycle + 2.759752652 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2737) (512y: 304) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134712319139954 -Relative difference = 1.7806676491157786e-07 +Avg ME (F77/C++) = 4.3134712322699498 +Relative difference = 1.7798424336580573e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.049249e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.186111e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.186111e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.068489e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.209462e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.209462e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.561293 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 6,578,699,260 cycles # 1.844 GHz - 9,927,155,424 instructions # 1.51 insn per cycle - 3.569129809 seconds time elapsed -=Symbols 
in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1565) (512y: 216) (512z: 2216) +TOTAL : 3.528645 sec + 6,308,539,404 cycles # 1.786 GHz + 9,645,872,961 instructions # 1.53 insn per cycle + 3.534370742 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1371) (512y: 204) (512z: 2172) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134712319139954 -Relative difference = 1.7806676491157786e-07 +Avg ME (F77/C++) = 4.3134712322699498 +Relative difference = 1.7798424336580573e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt index 2b34ea67ad..b64bd08c6e 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,223 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-10-06_10:00:07 +DATE: 2025-10-11_17:02:19 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.760509e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.779507e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.782702e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.749715e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.123100e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.185595e+05 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.473450 sec -INFO: No Floating Point Exceptions have been reported - 1,994,326,240 cycles # 2.874 GHz - 2,845,102,706 instructions # 1.43 insn per cycle - 0.753810347 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.460632 sec + 2,016,310,298 cycles # 2.828 GHz + 2,811,062,777 instructions # 1.39 insn per cycle + 0.771405460 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... 
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.019067e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.126130e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.133988e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.798297e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.902790e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.910598e+05 ) sec^-1 MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.490494 sec -INFO: No Floating Point Exceptions have been reported - 2,031,600,016 cycles # 2.857 GHz - 2,995,319,726 instructions # 1.47 insn per cycle - 0.772627668 seconds time elapsed +TOTAL : 0.483683 sec + 2,080,405,450 cycles # 2.828 GHz + 2,919,633,235 instructions # 1.40 insn per cycle + 0.795243442 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 8.127459e-06 Avg ME (F77/GPU) = 8.1274562860176604E-006 Relative difference = 3.3392753366481633e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause 
SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.383469e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.386752e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.386752e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.386932e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.390193e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.390193e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.158264 sec -INFO: No Floating Point Exceptions have been reported - 469,342,334 cycles # 2.906 GHz - 1,390,298,076 instructions # 2.96 insn per cycle - 0.162106230 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3908) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.158198 sec + 459,847,306 cycles # 2.852 GHz + 1,381,276,044 instructions # 3.00 insn per cycle + 0.161817794 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1508) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860167185E-006 Relative difference = 3.339276495559746e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.476358e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.488167e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.488167e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.255945e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.267065e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.267065e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.083612 sec -INFO: No Floating Point Exceptions have been reported - 240,584,825 cycles # 2.769 GHz - 693,113,903 instructions # 2.88 insn per cycle - 0.087424946 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 9482) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.086223 sec + 240,474,211 cycles # 2.695 GHz + 691,658,857 instructions # 2.88 insn per cycle + 0.089852973 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 9332) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860167168E-006 Relative difference = 3.3392764976441195e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.432068e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.438681e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.438681e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.385213e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.390914e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.390914e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.038906 sec -INFO: No Floating Point Exceptions have been reported - 114,140,366 cycles # 2.711 GHz - 257,891,266 instructions # 2.26 insn per cycle - 0.042661267 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8501) (512y: 0) (512z: 0) +TOTAL : 0.040134 sec + 114,132,005 cycles # 2.644 GHz + 258,038,380 instructions # 2.26 insn per cycle + 0.043763583 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8583) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860174791E-006 Relative difference = 3.3392755596761116e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.618386e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.625883e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.625883e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.538966e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.546528e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.546528e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.034671 sec -INFO: No Floating Point Exceptions have been reported - 102,555,024 cycles # 2.705 GHz - 240,017,026 instructions # 2.34 insn per cycle - 0.038425016 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8143) (512y: 150) (512z: 0) +TOTAL : 0.036228 sec + 103,692,755 cycles # 2.641 GHz + 240,622,200 instructions # 2.32 insn per cycle + 0.039728552 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8271) (512y: 130) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860174791E-006 Relative difference = 3.3392755596761116e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.192893e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.198052e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.198052e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.148417e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.153199e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.153199e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.046494 sec -INFO: No Floating Point Exceptions have been reported - 90,048,800 cycles # 1.806 GHz - 134,302,710 instructions # 1.49 insn per cycle - 0.050438224 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1943) (512y: 126) (512z: 7086) +TOTAL : 0.048211 sec + 90,387,142 cycles # 1.755 GHz + 134,612,621 instructions # 1.49 insn per cycle + 0.052002771 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2130) (512y: 104) (512z: 7074) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 
tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860174791E-006 Relative difference = 3.3392755596761116e-07 diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt index dc41fe503f..4db43dd255 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,223 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-10-06_10:00:18 +DATE: 2025-10-11_17:02:42 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.797107e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.816023e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.819423e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.803202e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.181220e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.245341e+05 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.469966 sec -INFO: No Floating Point Exceptions have been reported - 2,001,057,465 cycles # 2.881 GHz - 2,930,552,926 instructions # 1.46 insn per cycle - 0.752195966 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.458543 sec + 2,011,139,566 cycles # 2.825 GHz + 2,801,263,226 instructions # 1.39 insn per cycle + 0.769027350 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... 
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.121137e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.233030e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.241027e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.788680e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.895418e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.902637e+05 ) sec^-1 MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.489610 sec -INFO: No Floating Point Exceptions have been reported - 2,050,200,483 cycles # 2.873 GHz - 3,056,241,818 instructions # 1.49 insn per cycle - 0.771808178 seconds time elapsed +TOTAL : 0.483711 sec + 2,072,169,922 cycles # 2.815 GHz + 2,948,772,929 instructions # 1.42 insn per cycle + 0.795276590 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 8.127459e-06 Avg ME (F77/GPU) = 8.1274562860176604E-006 Relative difference = 3.3392753366481633e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause 
SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.406266e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.409565e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.409565e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.383885e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.387148e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.387148e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.156466 sec -INFO: No Floating Point Exceptions have been reported - 465,689,745 cycles # 2.917 GHz - 1,385,079,930 instructions # 2.97 insn per cycle - 0.160315659 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3796) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.157412 sec + 457,302,712 cycles # 2.851 GHz + 1,376,801,855 instructions # 3.01 insn per cycle + 0.160964317 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1502) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860167185E-006 Relative difference = 3.339276495559746e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.388983e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.401822e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.401822e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.288759e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.301116e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.301116e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.083799 sec -INFO: No Floating Point Exceptions have been reported - 238,961,924 cycles # 2.745 GHz - 689,073,758 instructions # 2.88 insn per cycle - 0.087593094 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 9525) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.085024 sec + 238,495,422 cycles # 2.707 GHz + 687,028,266 instructions # 2.88 insn per cycle + 0.088746242 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 9384) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860167168E-006 Relative difference = 3.3392764976441195e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.419818e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.425419e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.425419e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.395926e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.401596e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.401596e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.038479 sec -INFO: No Floating Point Exceptions have been reported - 111,800,811 cycles # 2.682 GHz - 253,484,287 instructions # 2.27 insn per cycle - 0.042138594 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8457) (512y: 0) (512z: 0) +TOTAL : 0.039010 sec + 112,073,428 cycles # 2.662 GHz + 253,139,110 instructions # 2.26 insn per cycle + 0.042677736 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8538) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860174791E-006 Relative difference = 3.3392755596761116e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.620452e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.628839e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.628839e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.525855e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.532589e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.532589e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.033872 sec -INFO: No Floating Point Exceptions have been reported - 100,998,379 cycles # 2.706 GHz - 235,641,730 instructions # 2.33 insn per cycle - 0.037957581 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8101) (512y: 150) (512z: 0) +TOTAL : 0.035869 sec + 101,601,884 cycles # 2.611 GHz + 235,894,497 instructions # 2.32 insn per cycle + 0.039518260 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8224) (512y: 130) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860174791E-006 Relative difference = 3.3392755596761116e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.156678e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.161477e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.161477e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.142399e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.147704e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.147704e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.047111 sec -INFO: No Floating Point Exceptions have been reported - 88,066,978 cycles # 1.743 GHz - 129,735,533 instructions # 1.47 insn per cycle - 0.051105123 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1899) (512y: 126) (512z: 7084) +TOTAL : 0.047633 sec + 88,136,356 cycles # 1.737 GHz + 129,828,247 instructions # 1.47 insn per cycle + 0.051419113 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2084) (512y: 104) (512z: 7074) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 
tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860174791E-006 Relative difference = 3.3392755596761116e-07 diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt index 4b10dcf1d1..5211bad1d2 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,223 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-10-06_10:00:52 +DATE: 2025-10-11_17:03:51 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.214342e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.224285e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.226222e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.302427e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.704300e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.791284e+05 ) sec^-1 MeanMatrixElemValue = ( 7.188141e-04 +- 6.565202e-04 ) GeV^-4 -TOTAL : 0.476842 sec -INFO: No Floating Point Exceptions have been reported - 1,989,613,876 cycles # 2.873 GHz - 2,928,089,356 instructions # 1.47 insn per cycle - 0.750924959 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.462607 sec + 2,015,593,801 cycles # 2.836 GHz + 2,784,970,796 instructions # 1.38 insn per cycle + 0.770212174 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 40 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... 
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.950242e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.029144e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.036217e+05 ) sec^-1 -MeanMatrixElemValue = ( 8.020494e-03 +- 4.025605e-03 ) GeV^-4 -TOTAL : 0.473909 sec -INFO: No Floating Point Exceptions have been reported - 1,995,145,721 cycles # 2.886 GHz - 2,912,342,089 instructions # 1.46 insn per cycle - 0.748274226 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.169898e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.187942e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.190235e+06 ) sec^-1 +MeanMatrixElemValue = ( 8.020495e-03 +- 4.025605e-03 ) GeV^-4 +TOTAL : 0.469557 sec + 2,042,790,873 cycles # 2.836 GHz + 2,884,156,824 instructions # 1.41 insn per cycle + 0.777382571 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 8.127250e-06 -Avg ME (F77/GPU) = 8.1272869669930272E-006 -Relative difference = 4.548524165778887e-06 +Avg ME (F77/GPU) = 8.1272869086972111E-006 +Relative difference = 4.541351282443064e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.462777e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.466245e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.466245e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.579211e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.582825e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.582825e+03 ) sec^-1 MeanMatrixElemValue = ( 7.177153e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.154509 sec -INFO: No Floating Point Exceptions have been reported - 463,950,135 cycles # 2.942 GHz - 1,382,102,782 instructions # 2.98 insn per cycle - 0.158280886 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3058) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.149618 sec + 441,460,345 cycles # 2.891 GHz + 1,357,431,891 instructions # 3.07 insn per cycle + 0.153196109 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1503) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127811e-06 -Avg ME (F77/C++) = 8.1278105271212486E-006 -Relative difference = 5.8180333155894157e-08 +Avg ME (F77/C++) = 8.1278105256181649E-006 +Relative difference = 5.836526409016727e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.221716e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.226773e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.226773e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.178631e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.183684e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.183684e+04 ) sec^-1 MeanMatrixElemValue = ( 7.177152e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.045148 sec -INFO: No Floating Point Exceptions have been reported - 132,927,826 cycles # 2.743 GHz - 372,156,154 instructions # 2.80 insn per cycle - 0.049041087 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:10141) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.046713 sec + 133,037,126 cycles # 2.662 GHz + 371,430,035 instructions # 2.79 insn per cycle + 0.050453436 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 9988) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127809e-06 Avg ME (F77/C++) = 8.1278090510674588E-006 Relative difference = 6.2830535070193674e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.776220e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.801025e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.801025e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.599910e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.621223e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.621223e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.021005 sec -INFO: No Floating Point Exceptions have been reported - 65,153,242 cycles # 2.690 GHz - 142,838,093 instructions # 2.19 insn per cycle - 0.024771930 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9241) (512y: 0) (512z: 0) +TOTAL : 0.022499 sec + 65,701,477 cycles # 2.576 GHz + 142,904,938 instructions # 2.18 insn per cycle + 0.026069649 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9322) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127537e-06 Avg ME (F77/C++) = 8.1275366216540664E-006 Relative difference = 4.655111786058001e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.070417e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.098717e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.098717e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.684576e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.708888e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.708888e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.019184 sec -INFO: No Floating Point Exceptions have been reported - 60,296,621 cycles # 2.678 GHz - 132,772,434 instructions # 2.20 insn per cycle - 0.023065155 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8959) (512y: 28) (512z: 0) +TOTAL : 0.021728 sec + 60,421,247 cycles # 2.428 GHz + 133,158,601 instructions # 2.20 insn per cycle + 0.025465207 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9093) (512y: 8) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127537e-06 Avg ME (F77/C++) = 8.1275366216540664E-006 Relative difference = 4.655111786058001e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.324469e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.345673e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.345673e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.239020e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.260813e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.260813e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165747e-04 +- 6.542824e-04 ) GeV^-4 -TOTAL : 0.024875 sec -INFO: No Floating Point Exceptions have been reported - 52,411,208 cycles # 1.857 GHz - 79,637,147 instructions # 1.52 insn per cycle - 0.028776798 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2836) (512y: 30) (512z: 7437) +TOTAL : 0.025827 sec + 52,150,255 cycles # 1.790 GHz + 79,743,681 instructions # 1.53 insn per cycle + 0.029792364 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3031) (512y: 8) (512z: 7424) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127537e-06 Avg ME (F77/C++) = 8.1275369863475849E-006 Relative difference = 1.6797726498700304e-09 diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt index 67a7328c67..c79acb423d 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,223 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-10-06_10:01:02 +DATE: 2025-10-11_17:04:20 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.235104e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.244507e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.246621e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.351614e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.802263e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.888038e+05 ) sec^-1 MeanMatrixElemValue = ( 7.188141e-04 +- 6.565202e-04 ) GeV^-4 -TOTAL : 0.477845 sec -INFO: No Floating Point Exceptions have been reported - 1,997,911,903 cycles # 2.876 GHz - 2,886,764,809 instructions # 1.44 insn per cycle - 0.753229194 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.458224 sec + 1,995,767,929 cycles # 2.816 GHz + 2,740,980,318 instructions # 1.37 insn per cycle + 0.766478985 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 40 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... 
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.096496e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.193422e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.206590e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.181811e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.198606e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.200307e+06 ) sec^-1 MeanMatrixElemValue = ( 8.020496e-03 +- 4.025606e-03 ) GeV^-4 -TOTAL : 0.477844 sec -INFO: No Floating Point Exceptions have been reported - 2,000,227,335 cycles # 2.879 GHz - 2,887,661,973 instructions # 1.44 insn per cycle - 0.753759254 seconds time elapsed +TOTAL : 0.469407 sec + 2,020,295,671 cycles # 2.810 GHz + 2,851,658,754 instructions # 1.41 insn per cycle + 0.776046944 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 8.127250e-06 -Avg ME (F77/GPU) = 8.1272866419447706E-006 -Relative difference = 4.508529302013153e-06 +Avg ME (F77/GPU) = 8.1272867096445498E-006 +Relative difference = 4.516859275763117e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd1/check_hip.exe ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.435869e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.439325e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.439325e+03 ) sec^-1 -MeanMatrixElemValue = ( 7.177153e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.154994 sec -INFO: No Floating Point Exceptions have been reported - 461,652,768 cycles # 2.918 GHz - 1,376,807,565 instructions # 2.98 insn per cycle - 0.158786297 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2930) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.511421e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.515116e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.515116e+03 ) sec^-1 +MeanMatrixElemValue = ( 7.177152e-04 +- 6.554185e-04 ) GeV^-4 +TOTAL : 0.151755 sec + 446,437,299 cycles # 2.884 GHz + 1,359,153,558 instructions # 3.04 insn per cycle + 0.155354916 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1960) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127811e-06 -Avg ME (F77/C++) = 8.1278105271212486E-006 -Relative difference = 5.8180333155894157e-08 +Avg ME (F77/C++) = 8.1278105326147384E-006 +Relative difference = 5.7504445173550794e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.215601e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.220158e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.220158e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.180553e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.185062e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.185062e+04 ) sec^-1 MeanMatrixElemValue = ( 7.177152e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.044587 sec -INFO: No Floating Point Exceptions have been reported - 130,364,411 cycles # 2.725 GHz - 367,274,419 instructions # 2.82 insn per cycle - 0.048380365 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:10124) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.045862 sec + 130,422,574 cycles # 2.664 GHz + 366,713,009 instructions # 2.81 insn per cycle + 0.049604747 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 9971) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127809e-06 Avg ME (F77/C++) = 8.1278090510674588E-006 Relative difference = 6.2830535070193674e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.799777e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.825160e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.825160e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.692821e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.714744e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.714744e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.020201 sec -INFO: No Floating Point Exceptions have been reported - 63,211,215 cycles # 2.704 GHz - 138,063,768 instructions # 2.18 insn per cycle - 0.023985955 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9196) (512y: 0) (512z: 0) +TOTAL : 0.020805 sec + 63,132,535 cycles # 2.647 GHz + 138,133,867 instructions # 2.19 insn per cycle + 0.024434416 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9272) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127537e-06 Avg ME (F77/C++) = 8.1275366216540664E-006 Relative difference = 4.655111786058001e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.035669e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.062918e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.062918e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.972359e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.000309e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.000309e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.018625 sec -INFO: No Floating Point Exceptions have been reported - 57,993,332 cycles # 2.658 GHz - 127,990,808 instructions # 2.21 insn per cycle - 0.022353301 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8910) (512y: 28) (512z: 0) +TOTAL : 0.019005 sec + 58,481,038 cycles # 2.633 GHz + 128,386,986 instructions # 2.20 insn per cycle + 0.022679122 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9045) (512y: 8) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127537e-06 Avg ME (F77/C++) = 8.1275366216540664E-006 Relative difference = 4.655111786058001e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.344103e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.363443e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.363443e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.272413e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.292411e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.292411e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165747e-04 +- 6.542824e-04 ) GeV^-4 -TOTAL : 0.024010 sec -INFO: No Floating Point Exceptions have been reported - 50,268,269 cycles # 1.840 GHz - 74,785,740 instructions # 1.49 insn per cycle - 0.027917015 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2791) (512y: 30) (512z: 7439) +TOTAL : 0.024623 sec + 50,322,119 cycles # 1.806 GHz + 74,992,557 instructions # 1.49 insn per cycle + 0.028526790 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2983) (512y: 8) (512z: 7425) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127537e-06 Avg ME (F77/C++) = 8.1275369863475849E-006 Relative difference = 1.6797726498700304e-09 diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt index 50cf2d796e..c43ff17d3c 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,223 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-10-06_10:00:29 +DATE: 2025-10-11_17:03:05 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.754018e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.771557e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.774637e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.763173e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.125938e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.192941e+05 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.469482 sec -INFO: No Floating Point Exceptions have been reported - 1,992,256,665 cycles # 2.872 GHz - 2,888,484,617 instructions # 1.45 insn per cycle - 0.750839241 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.458247 sec + 2,022,321,141 cycles # 2.816 GHz + 2,799,483,258 instructions # 1.38 insn per cycle + 0.774798224 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... 
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.962737e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.089994e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.098896e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.755571e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.866016e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.873910e+05 ) sec^-1 MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.485052 sec -INFO: No Floating Point Exceptions have been reported - 2,027,704,407 cycles # 2.871 GHz - 3,029,735,278 instructions # 1.49 insn per cycle - 0.765353713 seconds time elapsed +TOTAL : 0.484676 sec + 2,078,557,296 cycles # 2.829 GHz + 2,897,976,393 instructions # 1.39 insn per cycle + 0.794258904 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 8.127459e-06 -Avg ME (F77/GPU) = 8.1274562879405200E-006 -Relative difference = 3.3369094561706885e-07 +Avg ME (F77/GPU) = 8.1274562122604674E-006 +Relative difference = 3.4300259549904373e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.401289e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.404577e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.404577e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.388630e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.392004e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.392004e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.157429 sec -INFO: No Floating Point Exceptions have been reported - 471,621,611 cycles # 2.936 GHz - 1,398,387,891 instructions # 2.97 insn per cycle - 0.161191989 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3899) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.157940 sec + 464,903,592 cycles # 2.886 GHz + 1,389,803,957 instructions # 2.99 insn per cycle + 0.161593391 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1508) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562948736117E-006 Relative difference = 3.32837900190667e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.729709e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.743939e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.743939e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.572359e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.584503e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.584503e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.080446 sec -INFO: No Floating Point Exceptions have been reported - 237,178,815 cycles # 2.833 GHz - 688,220,781 instructions # 2.90 insn per cycle - 0.084309693 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 9334) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.082287 sec + 236,914,725 cycles # 2.777 GHz + 687,861,027 instructions # 2.90 insn per cycle + 0.085920826 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 9067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274563175290919E-006 Relative difference = 3.3005037703909805e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.409119e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.415451e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.415451e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.419898e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.425632e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.425632e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.039507 sec -INFO: No Floating Point Exceptions have been reported - 114,068,471 cycles # 2.665 GHz - 253,096,543 instructions # 2.22 insn per cycle - 0.043335126 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8363) (512y: 0) (512z: 0) +TOTAL : 0.039368 sec + 113,570,815 cycles # 2.680 GHz + 253,055,756 instructions # 2.23 insn per cycle + 0.042992839 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8121) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274563450143301E-006 Relative difference = 3.266686019634872e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.680681e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.688641e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.688641e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.595281e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.602693e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.602693e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.033493 sec -INFO: No Floating Point Exceptions have been reported - 101,334,967 cycles # 2.753 GHz - 233,610,113 instructions # 2.31 insn per cycle - 0.037380618 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7501) (512y: 146) (512z: 0) +TOTAL : 0.035105 sec + 102,173,670 cycles # 2.666 GHz + 233,820,968 instructions # 2.29 insn per cycle + 0.038810282 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7314) (512y: 126) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274563450143301E-006 Relative difference = 3.266686019634872e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.194656e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.199944e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.199944e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.158210e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.163544e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.163544e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.046435 sec -INFO: No Floating Point Exceptions have been reported - 91,210,419 cycles # 1.827 GHz - 133,172,431 instructions # 1.46 insn per cycle - 0.050429905 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2084) (512y: 122) (512z: 6354) +TOTAL : 0.047815 sec + 89,915,156 cycles # 1.766 GHz + 131,317,903 instructions # 1.46 insn per cycle + 0.051535880 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1995) (512y: 100) (512z: 6276) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274563450143301E-006 Relative difference = 3.266686019634872e-07 diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt index e1fc789bed..d6a9bd8585 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,223 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-10-06_10:00:41 +DATE: 2025-10-11_17:03:28 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.793622e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.811451e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.814397e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.669359e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.024328e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.088471e+05 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.470592 sec -INFO: No Floating Point Exceptions have been reported - 1,997,502,547 cycles # 2.880 GHz - 2,923,476,215 instructions # 1.46 insn per cycle - 0.750818094 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.459467 sec + 2,006,632,193 cycles # 2.818 GHz + 2,802,302,686 instructions # 1.40 insn per cycle + 0.769563513 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... 
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.055830e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.165646e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.173712e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.797271e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.897088e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.904896e+05 ) sec^-1 MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.491474 sec -INFO: No Floating Point Exceptions have been reported - 2,044,918,526 cycles # 2.859 GHz - 3,006,189,896 instructions # 1.47 insn per cycle - 0.774360899 seconds time elapsed +TOTAL : 0.485964 sec + 2,085,949,128 cycles # 2.828 GHz + 2,970,232,534 instructions # 1.42 insn per cycle + 0.796151358 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 8.127459e-06 -Avg ME (F77/GPU) = 8.1274562879405200E-006 -Relative difference = 3.3369094561706885e-07 +Avg ME (F77/GPU) = 8.1274562122604674E-006 +Relative difference = 3.4300259549904373e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.402707e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.406541e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.406541e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.393388e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.396682e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.396682e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.156609 sec -INFO: No Floating Point Exceptions have been reported - 468,766,259 cycles # 2.933 GHz - 1,393,706,102 instructions # 2.97 insn per cycle - 0.160398151 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3800) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.156959 sec + 461,726,786 cycles # 2.887 GHz + 1,385,347,614 instructions # 3.00 insn per cycle + 0.160462326 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1502) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562948736117E-006 Relative difference = 3.32837900190667e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.728046e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.740604e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.740604e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.599813e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.612219e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.612219e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.079699 sec -INFO: No Floating Point Exceptions have been reported - 235,148,851 cycles # 2.837 GHz - 684,201,633 instructions # 2.91 insn per cycle - 0.083458032 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 9368) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.081200 sec + 234,522,151 cycles # 2.781 GHz + 683,124,885 instructions # 2.91 insn per cycle + 0.084930246 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 9100) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274563175290919E-006 Relative difference = 3.3005037703909805e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.447554e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.453499e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.453499e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.420930e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.426598e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.426598e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.037781 sec -INFO: No Floating Point Exceptions have been reported - 111,660,471 cycles # 2.716 GHz - 248,651,696 instructions # 2.23 insn per cycle - 0.041691428 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8316) (512y: 0) (512z: 0) +TOTAL : 0.038386 sec + 111,202,178 cycles # 2.675 GHz + 248,277,259 instructions # 2.23 insn per cycle + 0.042154353 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8074) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274563450143301E-006 Relative difference = 3.266686019634872e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.634149e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.641617e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.641617e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.570276e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.578064e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.578064e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.033571 sec -INFO: No Floating Point Exceptions have been reported - 99,219,938 cycles # 2.697 GHz - 229,292,514 instructions # 2.31 insn per cycle - 0.037291206 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7452) (512y: 146) (512z: 0) +TOTAL : 0.034958 sec + 100,134,440 cycles # 2.632 GHz + 229,125,035 instructions # 2.29 insn per cycle + 0.038647286 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7265) (512y: 126) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274563450143301E-006 Relative difference = 3.266686019634872e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.191988e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.196872e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.196872e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.164156e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.168925e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.168925e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.045809 sec -INFO: No Floating Point Exceptions have been reported - 88,834,257 cycles # 1.806 GHz - 128,615,199 instructions # 1.45 insn per cycle - 0.049747357 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2035) (512y: 122) (512z: 6355) +TOTAL : 0.046899 sec + 87,248,248 cycles # 1.750 GHz + 126,582,829 instructions # 1.45 insn per cycle + 0.050568011 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1946) (512y: 100) (512z: 6276) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274563450143301E-006 Relative difference = 3.266686019634872e-07 diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt index 107a77153b..0619b08e27 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-10-06_09:58:55 +DATE: 2025-10-11_17:00:50 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.910300e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.325267e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.783205e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.353699e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.078498e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.922999e+08 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.523085 sec -INFO: No Floating Point Exceptions have been reported - 2,188,593,202 cycles # 2.883 GHz - 3,112,954,096 instructions # 1.42 insn per cycle - 0.817031478 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 130 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.530539 sec + 2,259,281,332 cycles # 2.839 GHz + 3,100,637,501 instructions # 1.37 insn per cycle + 0.855479528 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 124 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.477196e-01 -Avg ME (F77/GPU) = 0.14771956172964262 -Relative difference = 2.590743366698123e-07 +Avg ME (F77/GPU) = 0.14771956172964260 +Relative difference = 2.5907433685770594e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 9.066686e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.035589e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.035589e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.156775e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.205296e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.205296e+05 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 1.278019 sec -INFO: No Floating Point Exceptions have been reported - 3,764,987,469 cycles # 2.931 GHz - 9,752,169,319 instructions # 2.59 insn per cycle - 1.285199771 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 341) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.400705 sec + 4,031,222,897 cycles # 2.869 GHz + 9,715,380,409 instructions # 2.41 insn per cycle + 1.406286157 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 406) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.478889e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.890818e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.890818e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.450099e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.861491e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.861491e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.830635 sec -INFO: No Floating Point Exceptions have been reported - 2,356,582,684 cycles # 2.814 GHz - 5,959,230,788 instructions # 2.53 insn per cycle - 0.838030934 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1369) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.838337 sec + 2,350,240,123 cycles # 2.786 GHz + 5,962,397,870 instructions # 2.54 insn per cycle + 0.844193677 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1351) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The 
following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.229956e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.271002e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.271002e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.162719e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.161528e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.161528e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.594206 sec -INFO: No Floating Point Exceptions have been reported - 1,695,017,656 cycles # 2.820 GHz - 3,345,002,918 instructions # 1.97 insn per cycle - 0.601755215 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1499) (512y: 0) (512z: 0) +TOTAL : 0.600854 sec + 1,671,713,001 cycles # 2.758 GHz + 3,319,973,297 instructions # 1.99 insn per cycle + 0.606663801 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1492) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.272289e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.349942e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.349942e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.261662e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.349890e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.349890e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.586413 sec -INFO: No Floating Point Exceptions have been reported - 1,670,913,790 cycles # 2.815 GHz - 3,318,759,581 instructions # 1.99 insn per cycle - 0.594196558 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1375) (512y: 96) (512z: 0) +TOTAL : 0.577948 sec + 1,617,041,581 cycles # 2.773 GHz + 3,291,143,565 instructions # 2.04 insn per cycle + 0.583833732 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1367) (512y: 96) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 
4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.146635e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.068698e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.068698e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.100149e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.993172e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.993172e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.618076 sec -INFO: No Floating Point Exceptions have been reported - 1,426,424,228 cycles # 2.279 GHz - 2,470,718,173 instructions # 1.73 insn per cycle - 0.626622796 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 580) (512y: 60) (512z: 1021) +TOTAL : 0.615039 sec + 1,364,172,223 cycles # 2.200 GHz + 2,429,556,714 instructions # 1.78 insn per cycle + 0.620861975 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 583) (512y: 60) (512z: 1009) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt index 00276091a3..071e7697d0 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-10-06_09:59:08 +DATE: 2025-10-11_17:01:05 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.969963e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.427733e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.936447e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.417263e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.094810e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.959655e+08 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.519852 sec -INFO: No Floating Point Exceptions have been reported - 2,172,307,830 cycles # 2.872 GHz - 3,081,950,905 instructions # 1.42 insn per cycle - 0.813507263 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.525108 sec + 2,234,624,938 cycles # 2.820 GHz + 3,124,481,460 instructions # 1.40 insn per cycle + 0.850037014 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 122 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.477196e-01 -Avg ME (F77/GPU) = 0.14771956172964262 -Relative difference = 2.590743366698123e-07 +Avg ME (F77/GPU) = 0.14771956172964260 +Relative difference = 2.5907433685770594e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 9.156288e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.045734e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.045734e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.289834e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.373214e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.373214e+05 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 1.265578 sec -INFO: No Floating Point Exceptions have been reported - 3,747,828,201 cycles # 2.946 GHz - 9,632,221,913 instructions # 2.57 insn per cycle - 1.272810702 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 359) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.378734 sec + 3,995,674,296 cycles # 2.888 GHz + 9,595,338,306 instructions # 2.40 insn per cycle + 1.384441945 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 401) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.494739e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.931280e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.931280e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.457938e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.874008e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.874008e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.827695 sec -INFO: No Floating Point Exceptions have been reported - 2,378,817,913 cycles # 2.850 GHz - 5,912,991,474 instructions # 2.49 insn per cycle - 0.835517705 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1340) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.834586 sec + 2,348,281,075 cycles # 2.796 GHz + 5,903,694,010 instructions # 2.51 insn per cycle + 0.840556806 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1329) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/runTest_cpp.exe -INFO: The 
following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.079942e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.957305e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.957305e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.178686e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.194593e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.194593e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.628333 sec -INFO: No Floating Point Exceptions have been reported - 1,788,933,654 cycles # 2.817 GHz - 3,328,376,953 instructions # 1.86 insn per cycle - 0.635862534 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1436) (512y: 0) (512z: 0) +TOTAL : 0.595816 sec + 1,665,750,464 cycles # 2.772 GHz + 3,289,499,758 instructions # 1.97 insn per cycle + 0.601728408 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1437) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.320640e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.437091e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.437091e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.254319e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.335615e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.335615e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.574324 sec -INFO: No Floating Point Exceptions have been reported - 1,653,934,067 cycles # 2.845 GHz - 3,291,054,827 instructions # 1.99 insn per cycle - 0.581926884 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1328) (512y: 96) (512z: 0) +TOTAL : 0.579487 sec + 1,624,326,903 cycles # 2.777 GHz + 3,265,891,511 instructions # 2.01 insn per cycle + 0.585419257 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1330) (512y: 96) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 
4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.152026e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.087565e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.087565e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.069886e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.953317e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.953317e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.611501 sec -INFO: No Floating Point Exceptions have been reported - 1,420,414,146 cycles # 2.296 GHz - 2,439,626,449 instructions # 1.72 insn per cycle - 0.619276325 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 547) (512y: 60) (512z: 1007) +TOTAL : 0.621553 sec + 1,373,190,892 cycles # 2.193 GHz + 2,413,828,053 instructions # 1.76 insn per cycle + 0.627336488 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 550) (512y: 60) (512z: 1005) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt index bd2093b69b..6216dff6c8 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-10-06_09:59:44 +DATE: 2025-10-11_17:01:47 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.032605e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.087100e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.501992e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.174946e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.068173e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.272719e+08 ) sec^-1 MeanMatrixElemValue = ( 1.486732e-01 +- 3.293572e-05 ) GeV^0 -TOTAL : 0.487961 sec -INFO: No Floating Point Exceptions have been reported - 2,048,884,733 cycles # 2.866 GHz - 2,915,076,407 instructions # 1.42 insn per cycle - 0.773529382 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 97 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.489126 sec + 2,124,007,963 cycles # 2.815 GHz + 2,945,321,471 instructions # 1.39 insn per cycle + 0.811539193 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 83 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.477195e-01 -Avg ME (F77/GPU) = 0.14771956735057756 -Relative difference = 4.559355911674916e-07 +Avg ME (F77/GPU) = 0.14771956769982353 +Relative difference = 4.58299842099026e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 9.070270e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.039772e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.039772e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.779077e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.006315e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.006315e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293563e-05 ) GeV^0 -TOTAL : 1.246364 sec -INFO: No Floating Point Exceptions have been reported - 3,688,263,957 cycles # 2.948 GHz - 9,604,598,454 instructions # 2.60 insn per cycle - 1.251819600 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 463) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.286813 sec + 3,697,266,650 cycles # 2.863 GHz + 9,611,683,530 instructions # 2.60 insn per cycle + 1.292373810 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/runTest_cpp.exe 
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956094773486 Relative difference = 2.643675256627469e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.214709e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.338045e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.338045e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.204438e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.350250e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.350250e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293563e-05 ) GeV^0 -TOTAL : 0.563106 sec -INFO: No Floating Point Exceptions have been reported - 1,636,975,072 cycles # 2.881 GHz - 3,967,404,939 instructions # 2.42 insn per cycle - 0.568812477 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1579) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.567715 sec + 1,640,656,743 cycles # 2.864 GHz + 3,979,080,194 instructions # 2.43 insn per cycle + 0.573454265 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1553) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771955861942843 Relative difference = 2.80129187869649e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.994371e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.295152e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.295152e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.953501e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.188885e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.188885e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293562e-05 ) GeV^0 -TOTAL : 0.439594 sec -INFO: No Floating Point Exceptions have been reported - 1,256,321,725 cycles # 2.826 GHz - 2,497,438,777 instructions # 1.99 insn per cycle - 0.445252542 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1924) (512y: 0) (512z: 0) +TOTAL : 0.446090 sec + 1,257,376,904 cycles # 2.787 GHz + 2,504,409,181 instructions # 1.99 insn per cycle + 0.451851006 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1915) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 
tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771955698961392 Relative difference = 2.9116235141448046e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.098864e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.632832e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.632832e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.026066e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.404220e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.404220e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293562e-05 ) GeV^0 -TOTAL : 0.427898 sec -INFO: No Floating Point Exceptions have been reported - 1,236,536,318 cycles # 2.855 GHz - 2,473,365,360 instructions # 2.00 insn per cycle - 0.433705293 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1870) (512y: 1) (512z: 0) +TOTAL : 0.438014 sec + 1,235,323,979 cycles # 2.788 GHz + 2,479,535,477 instructions # 2.01 insn per cycle + 0.443692621 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1861) (512y: 1) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771955698961392 Relative difference = 2.9116235141448046e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.931142e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.994223e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.994223e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.854396e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.809242e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.809242e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293561e-05 ) GeV^0 -TOTAL : 0.448530 sec -INFO: No Floating Point Exceptions have been reported - 1,079,279,667 cycles # 2.379 GHz - 2,073,684,661 instructions # 1.92 insn per cycle - 0.454351959 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1011) (512y: 5) (512z: 1292) +TOTAL : 0.460001 sec + 1,078,883,681 cycles # 2.321 GHz + 2,076,270,716 instructions # 1.92 insn per cycle + 0.465628674 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1014) (512y: 5) (512z: 1276) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771955262403935 Relative difference = 3.207154680524219e-07 diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt index 2473496911..b9e5df5750 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-10-06_09:59:56 +DATE: 2025-10-11_17:02:06 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.032625e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.129649e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.575777e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.174766e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.032980e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.224739e+08 ) sec^-1 MeanMatrixElemValue = ( 1.486732e-01 +- 3.293572e-05 ) GeV^0 -TOTAL : 0.481858 sec -INFO: No Floating Point Exceptions have been reported - 2,051,512,664 cycles # 2.885 GHz - 2,948,723,179 instructions # 1.44 insn per cycle - 0.768027645 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 86 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.489051 sec + 2,148,781,052 cycles # 2.834 GHz + 2,942,650,451 instructions # 1.37 insn per cycle + 0.815858067 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 83 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.477195e-01 -Avg ME (F77/GPU) = 0.14771956525510177 -Relative difference = 4.4175008557828484e-07 +Avg ME (F77/GPU) = 0.14771956508047879 +Relative difference = 4.4056796011251757e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 9.212337e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.061006e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.061006e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.862221e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.017701e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.017701e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293563e-05 ) GeV^0 -TOTAL : 1.227579 sec -INFO: No Floating Point Exceptions have been reported - 3,620,291,769 cycles # 2.937 GHz - 9,471,544,557 instructions # 2.62 insn per cycle - 1.233302650 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 367) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.273068 sec + 3,660,086,626 cycles # 2.864 GHz + 9,502,319,452 instructions # 2.60 insn per cycle + 1.278709233 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 370) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956094773486 Relative difference = 2.643675256627469e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.220343e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.350531e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.350531e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.092947e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.109735e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.109735e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293563e-05 ) GeV^0 -TOTAL : 0.560958 sec -INFO: No Floating Point Exceptions have been reported - 1,637,220,191 cycles # 2.892 GHz - 3,933,324,289 instructions # 2.40 insn per cycle - 0.566799529 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1517) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.591777 sec + 1,671,501,463 cycles # 2.802 GHz + 3,947,247,316 instructions # 2.36 insn per cycle + 0.597353565 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1510) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/runTest_cpp.exe -INFO: The 
following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771955861942843 Relative difference = 2.80129187869649e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.995950e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.312007e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.312007e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.904335e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.013564e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.013564e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293562e-05 ) GeV^0 -TOTAL : 0.438140 sec -INFO: No Floating Point Exceptions have been reported - 1,255,613,659 cycles # 2.833 GHz - 2,482,092,959 instructions # 1.98 insn per cycle - 0.443764126 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1817) (512y: 0) (512z: 0) +TOTAL : 0.451671 sec + 1,251,161,997 cycles # 2.741 GHz + 2,488,699,975 instructions # 1.99 insn per cycle + 0.457155054 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1819) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771955698961392 Relative difference = 2.9116235141448046e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.087645e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.599722e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.599722e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.993855e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.299058e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.299058e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293562e-05 ) GeV^0 -TOTAL : 0.428178 sec -INFO: No Floating Point Exceptions have been reported - 1,231,320,501 cycles # 2.843 GHz - 2,457,271,461 instructions # 2.00 insn per cycle - 0.433769891 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1773) (512y: 1) (512z: 0) +TOTAL : 0.440947 sec + 1,225,739,794 cycles # 2.746 GHz + 2,464,639,586 instructions # 2.01 insn per cycle + 0.448602225 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1777) (512y: 1) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 
4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771955698961392 Relative difference = 2.9116235141448046e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.945345e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.024652e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.024652e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.880064e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.891083e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.891083e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293561e-05 ) GeV^0 -TOTAL : 0.444653 sec -INFO: No Floating Point Exceptions have been reported - 1,073,447,692 cycles # 2.387 GHz - 2,057,517,401 instructions # 1.92 insn per cycle - 0.450271011 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 906) (512y: 5) (512z: 1273) +TOTAL : 0.454521 sec + 1,073,931,359 cycles # 2.337 GHz + 2,059,749,623 instructions # 1.92 insn per cycle + 0.460150581 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 909) (512y: 5) (512z: 1267) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771955262403935 Relative difference = 3.207154680524219e-07 diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt index 5ae4907c26..5e30b14ca9 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-10-06_09:59:20 +DATE: 2025-10-11_17:01:19 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.059495e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.307970e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.770458e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.446721e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.093075e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.939789e+08 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.521991 sec -INFO: No Floating Point Exceptions have been reported - 2,182,804,723 cycles # 2.882 GHz - 3,091,712,352 instructions # 1.42 insn per cycle - 0.814546737 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 130 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.525703 sec + 2,236,736,054 cycles # 2.823 GHz + 3,119,267,572 instructions # 1.39 insn per cycle + 0.849597854 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 124 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.477196e-01 -Avg ME (F77/GPU) = 0.14771956187351573 -Relative difference = 2.5810037581511336e-07 +Avg ME (F77/GPU) = 0.14771956605979195 +Relative difference = 2.2976103415315142e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 9.006175e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.025890e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.025890e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.117543e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.151188e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.151188e+05 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 1.286472 sec -INFO: No Floating Point Exceptions have been reported - 3,808,533,169 cycles # 2.945 GHz - 9,779,238,528 instructions # 2.57 insn per cycle - 1.294044616 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 341) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.406267 sec + 4,043,925,432 cycles # 2.865 GHz + 9,738,556,635 instructions # 2.41 insn per cycle + 1.412149316 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 406) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956645541506 Relative difference = 2.270828308707201e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.477969e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.892042e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.892042e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.480932e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.914447e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.914447e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.834785 sec -INFO: No Floating Point Exceptions have been reported - 2,360,159,801 cycles # 2.803 GHz - 5,954,715,990 instructions # 2.52 insn per cycle - 0.842708021 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1412) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.824504 sec + 2,316,933,637 cycles # 2.792 GHz + 5,851,816,983 instructions # 2.53 insn per cycle + 0.830593669 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1366) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/runTest_cpp.exe -INFO: The 
following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956645541506 Relative difference = 2.270828308707201e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.260391e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.350498e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.350498e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.246053e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.337007e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.337007e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.587860 sec -INFO: No Floating Point Exceptions have been reported - 1,670,861,769 cycles # 2.810 GHz - 3,283,918,691 instructions # 1.97 insn per cycle - 0.595426943 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1567) (512y: 0) (512z: 0) +TOTAL : 0.582389 sec + 1,613,472,858 cycles # 2.745 GHz + 3,206,778,468 instructions # 1.99 insn per cycle + 0.588460320 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1531) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956674392650 Relative difference = 2.2512972893324335e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.348300e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.498815e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.498815e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.322435e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.481610e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.481610e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.573278 sec -INFO: No Floating Point Exceptions have been reported - 1,645,784,221 cycles # 2.835 GHz - 3,247,832,958 instructions # 1.97 insn per cycle - 0.581347619 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1446) (512y: 101) (512z: 0) +TOTAL : 0.567372 sec + 1,569,665,304 cycles # 2.742 GHz + 3,175,442,225 instructions # 2.02 insn per cycle + 0.573184846 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1435) (512y: 101) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/runTest_cpp.exe [ 
PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956674392650 Relative difference = 2.2512972893324335e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.143317e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.068862e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.068862e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.075660e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.951397e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.951397e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.613179 sec -INFO: No Floating Point Exceptions have been reported - 1,394,199,360 cycles # 2.248 GHz - 2,406,597,613 instructions # 1.73 insn per cycle - 0.620673412 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 768) (512y: 64) (512z: 1063) +TOTAL : 0.621447 sec + 1,359,798,497 cycles # 2.170 GHz + 2,353,126,759 instructions # 1.73 insn per cycle + 0.627307566 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 738) (512y: 64) (512z: 1042) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956674392650 Relative difference = 2.2512972893324335e-07 diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt index 3e507cd882..3f206f95bd 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-10-06_09:59:32 +DATE: 2025-10-11_17:01:33 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.080757e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.449829e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.987143e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.462369e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.119008e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.948835e+08 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.520335 sec -INFO: No Floating Point Exceptions have been reported - 2,182,231,478 cycles # 2.885 GHz - 3,097,447,003 instructions # 1.42 insn per cycle - 0.813407395 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.522593 sec + 2,229,764,062 cycles # 2.824 GHz + 3,122,707,099 instructions # 1.40 insn per cycle + 0.846718941 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 122 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.477196e-01 -Avg ME (F77/GPU) = 0.14771956187351573 -Relative difference = 2.5810037581511336e-07 +Avg ME (F77/GPU) = 0.14771956605979195 +Relative difference = 2.2976103415315142e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.967180e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.023779e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.023779e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.222292e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.282147e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.282147e+05 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 1.289771 sec -INFO: No Floating Point Exceptions have been reported - 3,794,201,935 cycles # 2.927 GHz - 9,666,542,351 instructions # 2.55 insn per cycle - 1.297077628 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 359) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.390029 sec + 4,041,827,914 cycles # 2.897 GHz + 9,620,480,831 instructions # 2.38 insn per cycle + 1.395839351 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 401) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956645541506 Relative difference = 2.270828308707201e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.583493e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.064503e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.064503e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.484588e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.916467e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.916467e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.784715 sec -INFO: No Floating Point Exceptions have been reported - 2,328,374,642 cycles # 2.942 GHz - 5,878,440,022 instructions # 2.52 insn per cycle - 0.792155161 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1371) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.821088 sec + 2,277,892,232 cycles # 2.757 GHz + 5,806,859,822 instructions # 2.55 insn per cycle + 0.826926685 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1349) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/runTest_cpp.exe -INFO: The 
following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956645541506 Relative difference = 2.270828308707201e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.254464e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.329047e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.329047e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.285308e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.418349e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.418349e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.590226 sec -INFO: No Floating Point Exceptions have been reported - 1,689,754,472 cycles # 2.827 GHz - 3,255,343,739 instructions # 1.93 insn per cycle - 0.598325338 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1483) (512y: 0) (512z: 0) +TOTAL : 0.573049 sec + 1,611,028,972 cycles # 2.786 GHz + 3,186,162,266 instructions # 1.98 insn per cycle + 0.579129244 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1474) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956674392650 Relative difference = 2.2512972893324335e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.345727e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.502859e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.502859e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.356503e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.544553e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.544553e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.572126 sec -INFO: No Floating Point Exceptions have been reported - 1,634,040,486 cycles # 2.820 GHz - 3,219,951,921 instructions # 1.97 insn per cycle - 0.580193189 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1382) (512y: 101) (512z: 0) +TOTAL : 0.558398 sec + 1,559,160,941 cycles # 2.767 GHz + 3,150,562,622 instructions # 2.02 insn per cycle + 0.564070384 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1373) (512y: 101) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/runTest_cpp.exe [ 
PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956674392650 Relative difference = 2.2512972893324335e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.168828e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.118471e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.118471e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.173215e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.148914e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.148914e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.609357 sec -INFO: No Floating Point Exceptions have been reported - 1,417,478,840 cycles # 2.299 GHz - 2,399,490,515 instructions # 1.69 insn per cycle - 0.617376810 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 716) (512y: 64) (512z: 1056) +TOTAL : 0.596537 sec + 1,348,900,555 cycles # 2.242 GHz + 2,335,239,112 instructions # 1.73 insn per cycle + 0.602236132 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 687) (512y: 64) (512z: 1030) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956674392650 Relative difference = 2.2512972893324335e-07 diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt index 607647c622..e3ea0d9299 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:56:33 +DATE: 2025-10-11_16:57:54 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.270000e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.214418e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.893995e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.706908e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.160258e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.561103e+07 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 0.542175 sec -INFO: No Floating Point Exceptions have been reported - 2,178,993,269 cycles # 2.803 GHz - 3,108,059,533 instructions # 1.43 insn per cycle - 0.838052893 seconds 
time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.544889 sec + 2,278,331,746 cycles # 2.802 GHz + 3,194,429,442 instructions # 1.40 insn per cycle + 0.872956184 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.015836e+00 -Avg ME (F77/GPU) = 2.0158358666195562 -Relative difference = 6.616631711254798e-08 +Avg ME (F77/GPU) = 2.0158358666195557 +Relative difference = 6.616631733284825e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.830273e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.876984e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.876984e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.781718e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.827404e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.827404e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 5.836443 sec -INFO: No Floating Point Exceptions have been reported - 17,247,101,824 cycles # 2.952 GHz - 45,921,478,129 instructions # 2.66 insn per cycle - 5.842453521 seconds time elapsed +TOTAL : 5.994100 sec + 17,282,311,221 cycles # 2.881 GHz + 46,327,593,495 instructions # 2.68 insn per cycle + 5.999488168 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 622) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194407 Relative difference = 6.616637439061751e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = 
CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.179372e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.338539e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.338539e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.117362e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.271065e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.271065e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.409251 sec -INFO: No Floating Point Exceptions have been reported - 10,038,815,546 cycles # 2.940 GHz - 27,809,165,185 instructions # 2.77 insn per cycle - 3.415697404 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2537) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.473625 sec + 10,058,480,748 cycles # 2.892 GHz + 27,928,334,913 instructions # 2.78 insn per cycle + 3.479625370 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2526) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194411 Relative difference = 6.616637417031725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = 
( 5.016017e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.397611e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.397611e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.891803e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.272223e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.272223e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.202025 sec -INFO: No Floating Point Exceptions have been reported - 6,083,216,423 cycles # 2.757 GHz - 12,595,496,799 instructions # 2.07 insn per cycle - 2.208459235 seconds time elapsed +TOTAL : 2.253673 sec + 6,113,479,898 cycles # 2.707 GHz + 12,619,681,498 instructions # 2.06 insn per cycle + 2.259543422 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2620) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194953 Relative difference = 6.616634729368461e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.491994e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.947919e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.947919e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.064851e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.470121e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
5.470121e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.018419 sec -INFO: No Floating Point Exceptions have been reported - 5,588,215,007 cycles # 2.761 GHz - 12,004,808,489 instructions # 2.15 insn per cycle - 2.024606102 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2365) (512y: 144) (512z: 0) +TOTAL : 2.179283 sec + 5,867,669,279 cycles # 2.687 GHz + 12,194,655,166 instructions # 2.08 insn per cycle + 2.184803472 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2417) (512y: 124) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194953 Relative difference = 6.616634729368461e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.529303e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.713663e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.713663e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.394256e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.568035e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.568035e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.082414 sec -INFO: No Floating Point Exceptions have been reported - 5,763,724,377 cycles # 1.867 
GHz - 8,350,228,242 instructions # 1.45 insn per cycle - 3.088879573 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1468) (512y: 122) (512z: 1806) +TOTAL : 3.199079 sec + 5,758,256,477 cycles # 1.797 GHz + 8,312,435,809 instructions # 1.44 insn per cycle + 3.204885362 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1464) (512y: 100) (512z: 1805) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194953 Relative difference = 6.616634729368461e-08 diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt index 3ed4c3c5ff..85796cb2e8 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:56:58 +DATE: 2025-10-11_16:58:23 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.306886e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.297289e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.977845e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.750318e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.090521e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.471741e+07 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 0.530204 sec -INFO: No Floating Point Exceptions have been reported - 2,211,323,980 cycles # 2.884 GHz - 3,201,430,578 instructions # 1.45 insn per cycle - 0.823926524 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 212 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.536193 sec + 2,280,468,803 cycles # 2.831 GHz + 3,171,048,990 instructions # 1.39 insn per cycle + 0.862856350 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 168 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.015836e+00 -Avg ME (F77/GPU) = 2.0158358666195562 -Relative difference = 6.616631711254798e-08 +Avg ME (F77/GPU) = 2.0158358666195557 +Relative difference = 6.616631733284825e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.872475e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.921622e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.921622e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.830968e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.879197e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.879197e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 5.706755 sec -INFO: No Floating Point Exceptions have been reported - 16,797,600,798 cycles # 2.941 GHz - 44,912,592,336 instructions # 2.67 insn per cycle - 5.712473159 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 566) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.834979 sec + 16,842,100,019 cycles # 2.884 GHz + 45,296,854,647 instructions # 2.69 insn per cycle + 5.840673910 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 567) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions 
will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194411 Relative difference = 6.616637417031725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.376254e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.552215e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.552215e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.286582e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.457425e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.457425e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.215396 sec -INFO: No Floating Point Exceptions have been reported - 9,523,990,060 cycles # 2.957 GHz - 26,686,144,259 instructions # 2.80 insn per cycle - 3.221864250 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2326) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.299071 sec + 9,574,991,301 cycles # 2.898 GHz + 26,751,055,486 instructions # 2.79 insn per cycle + 3.304842345 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2312) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194411 Relative difference = 6.616637417031725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.628485e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.953785e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.953785e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.483668e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.795787e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.795787e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.376456 sec -INFO: No Floating Point Exceptions have been reported - 6,603,885,103 cycles # 2.772 GHz - 14,117,515,687 instructions # 2.14 insn per cycle - 2.382952116 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2705) (512y: 0) (512z: 0) +TOTAL : 2.446633 sec + 6,630,126,092 cycles # 2.705 GHz + 14,155,939,252 instructions # 2.14 insn per cycle + 2.452232412 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2708) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194953 Relative difference = 6.616634729368461e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.799064e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.148539e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.148539e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.633646e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.966509e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.966509e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.297050 sec -INFO: No Floating Point Exceptions have been reported - 6,386,723,525 cycles # 2.773 GHz - 13,726,619,432 instructions # 2.15 insn per cycle - 2.304339219 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2356) (512y: 298) (512z: 0) +TOTAL : 2.371147 sec + 6,420,781,885 cycles # 2.703 GHz + 13,756,522,591 instructions # 2.14 insn per cycle + 2.376767940 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2358) (512y: 297) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194953 Relative difference = 6.616634729368461e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.339110e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.504311e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.504311e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.247851e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.404590e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.404590e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.254444 sec -INFO: No Floating Point Exceptions have been reported - 5,974,020,045 cycles # 1.833 GHz - 10,122,964,274 instructions # 1.69 insn per cycle - 3.261538649 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1318) (512y: 208) (512z: 1986) +TOTAL : 3.336819 sec + 5,939,444,089 cycles # 1.778 GHz + 10,130,416,003 instructions # 1.71 insn per cycle + 3.342426568 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1321) (512y: 208) (512z: 1987) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194953 Relative difference = 6.616634729368461e-08 diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt index 7bd4c9bca6..e92931017f 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:58:13 +DATE: 2025-10-11_16:59:57 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.178914e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.740854e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.866078e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.265470e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.796248e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.925275e+08 ) sec^-1 MeanMatrixElemValue = ( 2.072877e+00 +- 3.361153e-03 ) GeV^0 -TOTAL : 0.492488 sec -INFO: No Floating Point Exceptions have been reported - 2,067,407,730 cycles # 2.879 GHz - 2,921,575,837 instructions # 1.41 insn per cycle - 0.777094459 seconds 
time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 125 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.494715 sec + 2,133,928,532 cycles # 2.829 GHz + 2,961,237,291 instructions # 1.39 insn per cycle + 0.812186327 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 97 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.015841e+00 -Avg ME (F77/GPU) = 2.0158787037944421 -Relative difference = 1.870375413642407e-05 +Avg ME (F77/GPU) = 2.0158787077525631 +Relative difference = 1.870571764492604e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.933137e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.988210e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.988210e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.878391e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.930853e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.930853e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072937e+00 +- 3.361545e-03 ) GeV^0 -TOTAL : 5.510560 sec -INFO: No Floating Point Exceptions have been reported - 16,216,363,781 cycles # 2.940 GHz - 45,321,064,348 instructions # 2.79 insn per cycle - 5.516237540 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 600) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.670408 sec + 16,367,724,454 cycles # 2.885 GHz + 45,532,008,663 instructions # 2.78 insn per cycle + 5.675967017 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 605) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015849e+00 Avg ME (F77/C++) = 2.0158491701586172 Relative difference = 8.441039850630506e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] 
[inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.554782e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.893509e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.893509e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.407671e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.731067e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.731067e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072937e+00 +- 3.361544e-03 ) GeV^0 -TOTAL : 2.389253 sec -INFO: No Floating Point Exceptions have been reported - 7,056,712,623 cycles # 2.947 GHz - 17,792,064,584 instructions # 2.52 insn per cycle - 2.395009745 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3147) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.467869 sec + 7,095,747,201 cycles # 2.870 GHz + 17,858,347,842 instructions # 2.52 insn per cycle + 2.473312825 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3126) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015849e+00 Avg ME (F77/C++) = 2.0158486895961687 Relative difference = 1.539816876576819e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 
256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.351394e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.496890e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.496890e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.089358e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.160867e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.160867e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 1.343765 sec -INFO: No Floating Point Exceptions have been reported - 3,745,450,403 cycles # 2.777 GHz - 8,262,540,860 instructions # 2.21 insn per cycle - 1.349671424 seconds time elapsed +TOTAL : 1.384690 sec + 3,760,865,125 cycles # 2.707 GHz + 8,296,401,814 instructions # 2.21 insn per cycle + 1.390188663 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3371) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015847e+00 Avg ME (F77/C++) = 2.0158474864438176 Relative difference = 2.4130988992271984e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.821818e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.011140e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.011140e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.420631e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
9.588852e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.588852e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 1.275053 sec -INFO: No Floating Point Exceptions have been reported - 3,558,622,083 cycles # 2.780 GHz - 7,915,407,710 instructions # 2.22 insn per cycle - 1.280856743 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3214) (512y: 20) (512z: 0) +TOTAL : 1.334053 sec + 3,653,512,814 cycles # 2.729 GHz + 8,025,167,005 instructions # 2.20 insn per cycle + 1.339479555 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3272) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015847e+00 Avg ME (F77/C++) = 2.0158474864438176 Relative difference = 2.4130988992271984e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.584138e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.256759e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.256759e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.300716e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.921877e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.921877e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 1.679646 sec -INFO: No Floating Point Exceptions 
have been reported - 3,255,689,642 cycles # 1.933 GHz - 6,101,216,288 instructions # 1.87 insn per cycle - 1.685383243 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2258) (512y: 22) (512z: 2156) +TOTAL : 1.752788 sec + 3,290,640,509 cycles # 1.873 GHz + 6,097,403,848 instructions # 1.85 insn per cycle + 1.758187036 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2262) (512y: 0) (512z: 2152) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015848e+00 Avg ME (F77/C++) = 2.0158476348733529 Relative difference = 1.8112806478434436e-07 diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt index bd2def4f48..890303a8f4 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:58:33 +DATE: 2025-10-11_17:00:25 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.136229e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.747823e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.880709e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.221580e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.787567e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.918978e+08 ) sec^-1 MeanMatrixElemValue = ( 2.072877e+00 +- 3.361153e-03 ) GeV^0 -TOTAL : 0.488528 sec -INFO: No Floating Point Exceptions have been reported - 2,057,813,122 cycles # 2.874 GHz - 2,903,563,490 instructions # 1.41 insn per cycle - 0.774040886 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.494192 sec + 2,133,895,255 cycles # 2.826 GHz + 2,984,971,388 instructions # 1.40 insn per cycle + 0.812316425 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 96 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.015841e+00 -Avg ME (F77/GPU) = 2.0158787037944421 -Relative difference = 1.870375413642407e-05 +Avg ME (F77/GPU) = 2.0158787077525631 +Relative difference = 1.870571764492604e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.970300e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.026987e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.026987e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.920936e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.975706e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.975706e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072937e+00 +- 3.361545e-03 ) GeV^0 -TOTAL : 5.407589 sec -INFO: No Floating Point Exceptions have been reported - 15,991,185,925 cycles # 2.955 GHz - 44,429,993,623 instructions # 2.78 insn per cycle - 5.412895968 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 533) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.545042 sec + 16,055,557,680 cycles # 2.893 GHz + 44,606,147,249 instructions # 2.78 insn per cycle + 5.550363279 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 534) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions 
will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015849e+00 Avg ME (F77/C++) = 2.0158491701586172 Relative difference = 8.441039850630506e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.328908e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.798682e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.798682e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.166744e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.616602e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.616602e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072937e+00 +- 3.361544e-03 ) GeV^0 -TOTAL : 2.053409 sec -INFO: No Floating Point Exceptions have been reported - 6,061,427,520 cycles # 2.945 GHz - 17,076,312,832 instructions # 2.82 insn per cycle - 2.059026016 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2862) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.117207 sec + 6,107,535,010 cycles # 2.878 GHz + 17,151,265,141 instructions # 2.81 insn per cycle + 2.122735579 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2860) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015849e+00 Avg ME (F77/C++) = 2.0158486895961687 Relative difference = 1.539816876576819e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.019252e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.594125e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.594125e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.890362e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.440713e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.440713e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 1.827330 sec -INFO: No Floating Point Exceptions have been reported - 5,036,041,688 cycles # 2.749 GHz - 10,223,391,747 instructions # 2.03 insn per cycle - 1.833165934 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3906) (512y: 0) (512z: 0) +TOTAL : 1.868040 sec + 5,037,008,594 cycles # 2.691 GHz + 10,256,105,804 instructions # 2.04 insn per cycle + 1.873591030 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3910) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015847e+00 Avg ME (F77/C++) = 2.0158474864438176 Relative difference = 2.4130988992271984e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.156943e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.756865e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.756865e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.987209e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.558432e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.558432e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 1.789449 sec -INFO: No Floating Point Exceptions have been reported - 4,972,642,094 cycles # 2.772 GHz - 9,995,367,434 instructions # 2.01 insn per cycle - 1.795052964 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3805) (512y: 2) (512z: 0) +TOTAL : 1.838312 sec + 4,976,298,083 cycles # 2.700 GHz + 10,027,200,665 instructions # 2.01 insn per cycle + 1.843999254 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3807) (512y: 2) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015847e+00 Avg ME (F77/C++) = 2.0158474864438176 Relative difference = 2.4130988992271984e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.670992e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.000057e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.000057e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.543540e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.857388e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.857388e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 2.331763 sec -INFO: No Floating Point Exceptions have been reported - 4,369,500,962 cycles # 1.870 GHz - 8,444,287,674 instructions # 1.93 insn per cycle - 2.337616992 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2744) (512y: 4) (512z: 2754) +TOTAL : 2.395195 sec + 4,386,171,031 cycles # 1.828 GHz + 8,457,161,359 instructions # 1.93 insn per cycle + 2.400661750 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2747) (512y: 4) (512z: 2749) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015848e+00 Avg ME (F77/C++) = 2.0158476348733529 Relative difference = 1.8112806478434436e-07 diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt index 9029ad668b..2e4f76055c 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:57:23 +DATE: 2025-10-11_16:58:53 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.278122e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.299718e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.972605e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.803206e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.197061e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.595248e+07 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 0.535533 sec -INFO: No Floating Point Exceptions have been reported - 2,218,013,615 cycles # 2.871 GHz - 3,167,587,965 instructions # 1.43 insn per cycle - 0.830721869 seconds 
time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.542499 sec + 2,291,067,565 cycles # 2.822 GHz + 3,214,215,859 instructions # 1.40 insn per cycle + 0.903410898 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.015836e+00 -Avg ME (F77/GPU) = 2.0158358639104246 -Relative difference = 6.751024171044779e-08 +Avg ME (F77/GPU) = 2.0158359218521276 +Relative difference = 3.876697936613229e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.807535e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.852925e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.852925e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.773351e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.818033e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.818033e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 5.910224 sec -INFO: No Floating Point Exceptions have been reported - 17,388,420,068 cycles # 2.940 GHz - 46,077,588,135 instructions # 2.65 insn per cycle - 5.916245730 seconds time elapsed +TOTAL : 6.022953 sec + 17,468,685,186 cycles # 2.898 GHz + 46,428,017,151 instructions # 2.66 insn per cycle + 6.028694923 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 622) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359218686011 Relative difference = 3.8758807327712803e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = 
CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.226882e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.387878e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.387878e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.098858e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.251324e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.251324e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.360909 sec -INFO: No Floating Point Exceptions have been reported - 9,940,043,952 cycles # 2.953 GHz - 27,598,360,403 instructions # 2.78 insn per cycle - 3.367569953 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2581) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.494063 sec + 10,018,252,515 cycles # 2.863 GHz + 27,545,325,597 instructions # 2.75 insn per cycle + 3.499809973 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2543) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359218686011 Relative difference = 3.8758807327712803e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
5.038546e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.426797e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.426797e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.882400e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.252051e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.252051e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.194996 sec -INFO: No Floating Point Exceptions have been reported - 6,084,814,623 cycles # 2.765 GHz - 12,511,133,896 instructions # 2.06 insn per cycle - 2.201688699 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2776) (512y: 0) (512z: 0) +TOTAL : 2.257811 sec + 5,988,198,927 cycles # 2.647 GHz + 12,439,095,003 instructions # 2.08 insn per cycle + 2.263664182 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2756) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359178371690 Relative difference = 4.0758688308634e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.589922e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.068248e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.068248e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.259591e+05 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 5.697101e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.697101e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 1.988387 sec -INFO: No Floating Point Exceptions have been reported - 5,540,380,764 cycles # 2.778 GHz - 11,938,541,192 instructions # 2.15 insn per cycle - 1.995322896 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2521) (512y: 146) (512z: 0) +TOTAL : 2.102985 sec + 5,735,490,837 cycles # 2.721 GHz + 12,004,650,662 instructions # 2.09 insn per cycle + 2.108573871 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2556) (512y: 126) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359178371690 Relative difference = 4.0758688308634e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.615006e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.807457e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.807457e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.518029e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.702687e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.702687e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.015683 sec 
-INFO: No Floating Point Exceptions have been reported - 5,630,115,254 cycles # 1.863 GHz - 8,130,918,173 instructions # 1.44 insn per cycle - 3.022730001 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1671) (512y: 126) (512z: 1865) +TOTAL : 3.089670 sec + 5,573,654,696 cycles # 1.801 GHz + 7,983,962,804 instructions # 1.43 insn per cycle + 3.095529304 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1645) (512y: 104) (512z: 1826) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359178371690 Relative difference = 4.0758688308634e-08 diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt index 44aa1a6a94..09594959d7 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:57:48 +DATE: 2025-10-11_16:59:25 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.308177e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.314026e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.965515e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.800950e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.127229e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.485215e+07 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 0.530653 sec -INFO: No Floating Point Exceptions have been reported - 2,220,013,015 cycles # 2.891 GHz - 3,185,773,009 instructions # 1.44 insn per cycle - 0.824701846 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 212 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.537601 sec + 2,294,644,932 cycles # 2.834 GHz + 3,202,661,173 instructions # 1.40 insn per cycle + 0.866738405 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 168 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.015836e+00 -Avg ME (F77/GPU) = 2.0158358639104246 -Relative difference = 6.751024171044779e-08 +Avg ME (F77/GPU) = 2.0158359218521276 +Relative difference = 3.876697936613229e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.857128e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.905464e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.905464e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.809865e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.856790e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.856790e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 5.753526 sec -INFO: No Floating Point Exceptions have been reported - 16,958,834,547 cycles # 2.945 GHz - 45,095,701,979 instructions # 2.66 insn per cycle - 5.759360611 seconds time elapsed +TOTAL : 5.902916 sec + 17,031,724,118 cycles # 2.883 GHz + 45,397,065,381 instructions # 2.67 insn per cycle + 5.908631173 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 567) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW 
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359218686011 Relative difference = 3.8758807327712803e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.365466e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.544754e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.544754e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.294098e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.465793e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.465793e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.232551 sec -INFO: No Floating Point Exceptions have been reported - 9,533,065,833 cycles # 2.943 GHz - 26,273,852,197 instructions # 2.76 insn per cycle - 3.239846074 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2386) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.291976 sec + 9,561,103,669 cycles # 2.900 GHz + 26,144,822,297 instructions # 2.73 insn per cycle + 3.297670541 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2347) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359218686011 Relative difference = 3.8758807327712803e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.514012e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.821697e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.821697e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.426643e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.734905e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.734905e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.435584 sec -INFO: No Floating Point Exceptions have been reported - 6,758,526,375 cycles # 2.768 GHz - 14,047,168,742 instructions # 2.08 insn per cycle - 2.442338814 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2895) (512y: 0) (512z: 0) +TOTAL : 2.478214 sec + 6,700,126,016 cycles # 2.700 GHz + 13,943,282,534 instructions # 2.08 insn per cycle + 2.483989370 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2871) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359178371690 Relative difference = 4.0758688308634e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.791737e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.138604e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.138604e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.620283e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.949819e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.949819e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.301242 sec -INFO: No Floating Point Exceptions have been reported - 6,403,253,635 cycles # 2.776 GHz - 13,529,712,107 instructions # 2.11 insn per cycle - 2.307614270 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2531) (512y: 302) (512z: 0) +TOTAL : 2.378094 sec + 6,404,718,099 cycles # 2.688 GHz + 13,458,943,081 instructions # 2.10 insn per cycle + 2.383779382 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2508) (512y: 302) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359178371690 Relative difference = 4.0758688308634e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.627313e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.823087e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.823087e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.539955e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.726603e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.726603e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.002431 sec -INFO: No Floating Point Exceptions have been reported - 5,614,669,392 cycles # 1.866 GHz - 9,218,497,811 instructions # 1.64 insn per cycle - 3.009264991 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1456) (512y: 212) (512z: 2059) +TOTAL : 3.070043 sec + 5,557,581,294 cycles # 1.808 GHz + 9,121,741,259 instructions # 1.64 insn per cycle + 3.075761617 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1425) (512y: 212) (512z: 2027) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359178371690 Relative difference = 4.0758688308634e-08 diff --git a/epochX/cudacpp/tput/teeThroughputX.sh b/epochX/cudacpp/tput/teeThroughputX.sh index 088371cb95..c4180b6725 100755 --- a/epochX/cudacpp/tput/teeThroughputX.sh +++ b/epochX/cudacpp/tput/teeThroughputX.sh @@ -1,8 +1,8 @@ #!/bin/bash -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -# Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. scrdir=$(cd $(dirname $0); pwd) bckend=$(basename $(cd $scrdir; cd ..; pwd)) # cudacpp or alpaka @@ -10,7 +10,7 @@ cd $scrdir function usage() { - echo "Usage: $0 [-nocuda] [-sa] [-noalpaka] [-dblonly|-fltonly|-d_f|-dmf] [-inl|-inlonly] [-hrd|-hrdonly] [-common|-curhst] [-rmbhst|-bridge] [-makeonly] [-makeclean] [-makej] [-dlp ]" # -nofpe is no longer supported + echo "Usage: $0 [-nocuda] [-sa] [-noalpaka] [-dblonly|-fltonly|-d_f|-dmf] [-inl|-inlonly] [-hrd|-hrdonly] [-common|-curhst] [-rmbhst|-bridge] [-noBlas|-blasOn] [-makeonly] [-makeclean] [-makej] [-scaling] [-dlp ]" # -nofpe is no longer supported exit 1 } @@ -33,8 +33,10 @@ helinls="0" hrdcods="0" rndgen= rmbsmp= +blas="" # build with blas but disable it at runtime steps="make test" makej= +scaling= ###nofpe= dlp= dlpset=0 @@ -117,6 +119,12 @@ for arg in $*; do rmbsmp=$arg elif [ "$arg" == "-bridge" ]; then rmbsmp=$arg + elif [ "$arg" == "-noBlas" ]; then # build with blas but disable it at runtime + if [ "${blas}" == "-blasOn" ]; then echo "ERROR! Options -noBlas and -blasOn are incompatible"; usage; fi + blas=$arg + elif [ "$arg" == "-blasOn" ]; then # build with blas and enable it at runtime + if [ "${blas}" == "-noBlas" ]; then echo "ERROR! 
Options -noBlas and -blasOn are incompatible"; usage; fi + blas=$arg elif [ "$arg" == "-makeonly" ]; then if [ "${steps}" == "make test" ]; then steps="make" @@ -131,6 +139,8 @@ for arg in $*; do fi elif [ "$arg" == "-makej" ]; then makej=-makej + elif [ "$arg" == "-scaling" ]; then + scaling=$arg ###elif [ "$arg" == "-nofpe" ]; then ### nofpe=-nofpe else @@ -175,6 +185,8 @@ for step in $steps; do args="${args} ${alpaka}" # optionally disable alpaka tests args="${args} ${rndgen}" # optionally use common random numbers or curand on host args="${args} ${rmbsmp}" # optionally use rambo or bridge on host + args="${args} ${scaling}" # optionally run scaling tests + args="${args} ${blas}" # optionally build with no blas or instead enable it at runtime ###args="${args} ${nofpe}" # optionally disable FPEs args="${args} ${bldall}" # avx, fptype, helinl and hrdcod are now supported for all processes if [ "${step}" == "makeclean" ]; then @@ -191,6 +203,8 @@ for step in $steps; do logfile=logs_${proc#-}_${sufflog}/log_${proc#-}_${sufflog}_${fptype}_inl${helinl}_hrd${hrdcod}.txt if [ "${rndgen}" != "" ]; then logfile=${logfile%.txt}_${rndgen#-}.txt; fi if [ "${rmbsmp}" != "" ]; then logfile=${logfile%.txt}_${rmbsmp#-}.txt; fi + if [ "${blas}" != "" ]; then logfile=${logfile%.txt}_${blas#-}.txt; fi + if [ "${scaling}" != "" ]; then logfile=${logfile%.txt}.scaling; fi printf "\n%80s\n" |tr " " "*" printf "*** ./throughputX.sh $args | tee $logfile" printf "\n%80s\n" |tr " " "*" diff --git a/epochX/cudacpp/tput/throughputX.sh b/epochX/cudacpp/tput/throughputX.sh index 68df662f58..5d870a48ab 100755 --- a/epochX/cudacpp/tput/throughputX.sh +++ b/epochX/cudacpp/tput/throughputX.sh @@ -1,8 +1,8 @@ #!/bin/bash -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -# Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. set +x # not verbose set -e # fail on error @@ -19,7 +19,7 @@ export MG5AMC_CHANNELID_DEBUG=1 function usage() { - echo "Usage: $0 [-bldall|-nocuda|-cpponly|-cudaonly|-hiponly|-noneonly|-sse4only|-avx2only|-512yonly|-512zonly] [-sa] [-noalpaka] [-dblonly|-fltonly|-d_f|-dmf] [-inl|-inlonly] [-hrd|-hrdonly] [-common|-curhst] [-rmbhst|-bridge] [-omp] [-makeonly|-makeclean|-makecleanonly|-dryrun] [-makej] [-3a3b] [-div] [-req] [-detailed] [-gtest(default)|-nogtest] [-v] [-dlp ]" # -nofpe is no longer supported + echo "Usage: $0 [-bldall|-nocuda|-cpponly|-cudaonly|-hiponly|-noneonly|-sse4only|-avx2only|-512yonly|-512zonly] [-sa] [-noalpaka] [-dblonly|-fltonly|-d_f|-dmf] [-inl|-inlonly] [-hrd|-hrdonly] [-common|-curhst] [-rmbhst|-bridge] [-noBlas|-blasOn] [-omp] [-makeonly|-makeclean|-makecleanonly|-dryrun] [-makej] [-3a3b] [-div] [-req] [-detailed] [-gtest(default)|-nogtest] [-scaling] [-v] [-dlp ]" # -nofpe is no longer supported exit 1 } @@ -49,7 +49,9 @@ fptypes="m" # new default #995 (was "d") helinls="0" hrdcods="0" rndgen="" -rmbsam="" +rmbsmp="" + +blas="" # build with blas but disable it at runtime maketype= makej= @@ -59,6 +61,7 @@ div=0 req=0 detailed=0 gtest= +scaling=0 ###nofpe=0 verbose=0 @@ -211,6 +214,14 @@ while [ "$1" != "" ]; do elif [ "$1" == "-bridge" ]; then rmbsmp=" -${1}" shift + elif [ "$1" == "-noBlas" ]; then # build without blas + if [ "${blas}" == "-blasOn" ]; then echo "ERROR! 
Options -noBlas and -blasOn are incompatible"; usage; fi + blas=$1 + shift + elif [ "$1" == "-blasOn" ]; then # build with blas and enable it at runtime + if [ "${blas}" == "-noBlas" ]; then echo "ERROR! Options -noBlas and -blasOn are incompatible"; usage; fi + blas=$1 + shift elif [ "$1" == "-makeonly" ] || [ "$1" == "-makeclean" ] || [ "$1" == "-makecleanonly" ] || [ "$1" == "-dryrun" ]; then if [ "${maketype}" != "" ] && [ "${maketype}" != "$1" ]; then echo "ERROR! Options -makeonly, -makeclean, -makecleanonly and -dryrun are incompatible"; usage @@ -245,6 +256,9 @@ while [ "$1" != "" ]; do fi gtest=0 shift + elif [ "$1" == "-scaling" ]; then + scaling=1 + shift ###elif [ "$1" == "-nofpe" ]; then ### nofpe=1 ### shift @@ -371,6 +385,9 @@ function showdir() echo $dir } +echo MADGRAPH_CUDA_ARCHITECTURE=${MADGRAPH_CUDA_ARCHITECTURE} +echo MADGRAPH_HIP_ARCHITECTURE=${MADGRAPH_HIP_ARCHITECTURE} + ###echo -e "\n********************************************************************************\n" printf "\n" @@ -434,6 +451,13 @@ done # PART 2 - build the executables which should be run ########################################################################## +if [ "${blas}" == "-noBlas" ]; then + export HASBLAS=hasNoBlas +else + export HASBLAS=hasBlas +fi +echo HASBLAS=${HASBLAS} + unset GTEST_ROOT unset LOCALGTEST @@ -497,6 +521,18 @@ if [ "${maketype}" != "-dryrun" ]; then printf "DATE: $(date '+%Y-%m-%d_%H:%M:%S')\n\n" fi +echo HASBLAS=${HASBLAS} + +if [ "${blas}" == "-blasOn" ]; then + export CUDACPP_RUNTIME_BLASCOLORSUM=1 +else + unset CUDACPP_RUNTIME_BLASCOLORSUM +fi +echo CUDACPP_RUNTIME_BLASCOLORSUM=${CUDACPP_RUNTIME_BLASCOLORSUM} + +unset CUDACPP_RUNTIME_CUBLASTF32TENSOR +echo CUDACPP_RUNTIME_CUBLASTF32TENSOR=${CUDACPP_RUNTIME_CUBLASTF32TENSOR} + function runExe() { exe1=$1 args="$2" @@ -507,6 +543,7 @@ function runExe() { # Optionally add other patterns here for some specific configurations (e.g. clang) if [ "${exe1%%/check_cuda*}" != "${exe1}" ] || [ "${exe1%%/check_hip*}" != "${exe1}" ]; then pattern="${pattern}|EvtsPerSec\[Matrix"; fi pattern="${pattern}|Workflow" + ###pattern="${pattern}|BLASCOLORSUM" ###pattern="${pattern}|CUCOMPLEX" ###pattern="${pattern}|COMMON RANDOM|CURAND HOST \(CUDA" pattern="${pattern}|ERROR" @@ -523,7 +560,7 @@ function runExe() { if [ "${detailed}" == "1" ]; then pattern="${pattern}|#"; fi if [ "${verbose}" == "1" ]; then set -x; fi ###perf stat -d $exe1 $args 2>&1 | grep -v "Performance counter stats" - perf stat -d $exe1 $args 2>&1 | egrep "(${pattern})" | grep -v "Performance counter stats" + perf stat -d $exe1 $args 2>&1 | egrep "(${pattern})" | grep -v "Performance counter stats" |& sed 's/.*rocdevice.cpp.*Aborting.*/rocdevice.cpp: Aborting/' set +x else # -- Older version using time @@ -539,6 +576,7 @@ function runTest() { echo "runTest $exe1" if [ "${maketype}" == "-dryrun" ]; then return; fi pattern="PASS|FAIL" + ###pattern="${pattern}|BLASCOLORSUM" pattern="${pattern}|ERROR" pattern="${pattern}|WARNING" pattern="${pattern}|Floating Point Exception" @@ -563,10 +601,12 @@ function cmpExe() { echo "ERROR! C++ calculation (C++${tag} failed"; exit 1 # expose FPE crash #1003 on HIP fi me1=$(cat ${tmp1} | grep MeanMatrix | awk '{print $4}'); cat ${tmp2} + ###cat ${tmp1} | grep BLASCOLORSUM if ! ${exef} ${argsf} 2>${tmp2} >${tmp1}; then echo "ERROR! 
Fortran calculation (F77${tag} failed"; exit 1 fi me2=$(cat ${tmp1} | grep Average | awk '{print $4}'); cat ${tmp2} + ###cat ${tmp1} | grep BLASCOLORSUM echo -e "Avg ME (C++${tag} = ${me1}\nAvg ME (F77${tag} = ${me2}" if [ "${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77${tag} returned NaN"; exit 1 @@ -588,16 +628,23 @@ function runNcu() { args="$2" args="$args$rndgen$rmbsmp" echo "runNcu $exe1 $args" - if [ "${verbose}" == "1" ]; then set -x; fi - #$(which ncu) --metrics launch__registers_per_thread,sm__sass_average_branch_targets_threads_uniform.pct --target-processes all --kernel-id "::sigmaKin:" --kernel-name-base function $exe1 $args | egrep '(sigmaKin|registers| sm)' | tr "\n" " " | awk '{print $1, $2, $3, $15, $17; print $1, $2, $3, $18, $20$19}' - set +e # do not fail on error - out=$($(which ncu) --metrics launch__registers_per_thread,sm__sass_average_branch_targets_threads_uniform.pct --target-processes all --kernel-id "::sigmaKin:" --kernel-name-base function $exe1 $args) - echo "$out" | egrep '(ERROR|WARNING)' # NB must escape $out in between quotes - set -e # fail on error (after ncu and after egrep!) - out=$(echo "${out}" | egrep '(sigmaKin|registers| sm)' | tr "\n" " ") # NB must escape $out in between quotes - echo $out | awk -v key1="launch__registers_per_thread" '{val1="N/A"; for (i=1; i<=NF; i++){if ($i==key1 && $(i+1)!="(!)") val1=$(i+2)}; print $1, $2, $3, key1, val1}' - echo $out | awk -v key1="sm__sass_average_branch_targets_threads_uniform.pct" '{val1="N/A"; for (i=1; i<=NF; i++){if ($i==key1 && $(i+1)!="(!)") val1=$(i+2)$(i+1)}; print $1, $2, $3, key1, val1}' - set +x + ###echoblas=1 + kernels="calculate_jamps color_sum_kernel" + ###if [ "${CUDACPP_RUNTIME_BLASCOLORSUM}" == "1" ]; then kernels="$kernels kernel"; fi # heavy to profile... + ###if [ "${CUDACPP_RUNTIME_BLASCOLORSUM}" == "1" ]; then kernels="$kernels regex:gemm"; fi # output to be improved... + for kernel in $kernels; do + if [ "${verbose}" == "1" ]; then set -x; fi + #$(which ncu) --metrics launch__registers_per_thread,sm__sass_average_branch_targets_threads_uniform.pct --target-processes all --kernel-id "::${kernel}:" --kernel-name-base function $exe1 $args | egrep '(calculate_jamps|registers| sm)' | tr "\n" " " | awk '{print $1, $2, $3, $15, $17; print $1, $2, $3, $18, $20$19}' + set +e # do not fail on error + out=$($(which ncu) --metrics launch__registers_per_thread,sm__sass_average_branch_targets_threads_uniform.pct --target-processes all --kernel-id "::${kernel}:" --kernel-name-base function $exe1 $args) + echo "$out" | egrep '(ERROR|WARNING)' # NB must escape $out in between quotes + ###if [ "${echoblas}" == "1" ]; then echo "$out" | egrep '(BLASCOLORSUM)'; echoblas=0; fi + set -e # fail on error (after ncu and after egrep!) 
+ out=$(echo "${out}" | egrep "(${kernel}|registers| sm)" | tr "\n" " ") # NB must escape $out in between quotes + echo $out | awk -v key1="launch__registers_per_thread" '{val1="N/A"; for (i=1; i<=NF; i++){if ($i==key1 && $(i+1)!="(!)") val1=$(i+2)}; print $1, $2, $3, key1, val1}' + echo $out | awk -v key1="sm__sass_average_branch_targets_threads_uniform.pct" '{val1="N/A"; for (i=1; i<=NF; i++){if ($i==key1 && $(i+1)!="(!)") val1=$(i+2)$(i+1)}; print $1, $2, $3, key1, val1}' + set +x + done } # Profile divergence metrics more in detail @@ -613,11 +660,11 @@ function runNcuDiv() { ###echo "runNcuDiv $exe1 $args" if [ "${verbose}" == "1" ]; then set -x; fi ###$(which ncu) --query-metrics $exe1 $args - ###$(which ncu) --metrics regex:.*branch_targets.* --target-processes all --kernel-id "::sigmaKin:" --kernel-name-base function $exe1 $args - ###$(which ncu) --metrics regex:.*stalled_barrier.* --target-processes all --kernel-id "::sigmaKin:" --kernel-name-base function $exe1 $args - ###$(which ncu) --metrics sm__sass_average_branch_targets_threads_uniform.pct,smsp__warps_launched.sum,smsp__sass_branch_targets.sum,smsp__sass_branch_targets_threads_divergent.sum,smsp__sass_branch_targets_threads_uniform.sum --target-processes all --kernel-id "::sigmaKin:" --kernel-name-base function $exe1 $args | egrep '(sigmaKin| sm)' | tr "\n" " " | awk '{printf "%29s: %-51s %s\n", "", $18, $19; printf "%29s: %-51s %s\n", "", $22, $23; printf "%29s: %-51s %s\n", "", $20, $21; printf "%29s: %-51s %s\n", "", $24, $26}' - #$(which ncu) --metrics sm__sass_average_branch_targets_threads_uniform.pct,smsp__warps_launched.sum,smsp__sass_branch_targets.sum,smsp__sass_branch_targets_threads_divergent.sum,smsp__sass_branch_targets_threads_uniform.sum,smsp__sass_branch_targets.sum.per_second,smsp__sass_branch_targets_threads_divergent.sum.per_second,smsp__sass_branch_targets_threads_uniform.sum.per_second --target-processes all --kernel-id "::sigmaKin:" --kernel-name-base function $exe1 $args | egrep '(sigmaKin| sm)' | tr "\n" " " | awk '{printf "%29s: %-51s %-10s %s\n", "", $18, $19, $22$21; printf "%29s: %-51s %-10s %s\n", "", $28, $29, $32$31; printf "%29s: %-51s %-10s %s\n", "", $23, $24, $27$26; printf "%29s: %-51s %s\n", "", $33, $35}' - out=$($(which ncu) --metrics sm__sass_average_branch_targets_threads_uniform.pct,smsp__warps_launched.sum,smsp__sass_branch_targets.sum,smsp__sass_branch_targets_threads_divergent.sum,smsp__sass_branch_targets_threads_uniform.sum,smsp__sass_branch_targets.sum.per_second,smsp__sass_branch_targets_threads_divergent.sum.per_second,smsp__sass_branch_targets_threads_uniform.sum.per_second --target-processes all --kernel-id "::sigmaKin:" --kernel-name-base function $exe1 $args | egrep '(sigmaKin| sm)' | tr "\n" " ") + ###$(which ncu) --metrics regex:.*branch_targets.* --target-processes all --kernel-id "::calculate_jamps:" --kernel-name-base function $exe1 $args + ###$(which ncu) --metrics regex:.*stalled_barrier.* --target-processes all --kernel-id "::calculate_jamps:" --kernel-name-base function $exe1 $args + ###$(which ncu) --metrics sm__sass_average_branch_targets_threads_uniform.pct,smsp__warps_launched.sum,smsp__sass_branch_targets.sum,smsp__sass_branch_targets_threads_divergent.sum,smsp__sass_branch_targets_threads_uniform.sum --target-processes all --kernel-id "::calculate_jamps:" --kernel-name-base function $exe1 $args | egrep '(calculate_jamps| sm)' | tr "\n" " " | awk '{printf "%29s: %-51s %s\n", "", $18, $19; printf "%29s: %-51s %s\n", "", $22, $23; printf "%29s: %-51s %s\n", 
"", $20, $21; printf "%29s: %-51s %s\n", "", $24, $26}' + #$(which ncu) --metrics sm__sass_average_branch_targets_threads_uniform.pct,smsp__warps_launched.sum,smsp__sass_branch_targets.sum,smsp__sass_branch_targets_threads_divergent.sum,smsp__sass_branch_targets_threads_uniform.sum,smsp__sass_branch_targets.sum.per_second,smsp__sass_branch_targets_threads_divergent.sum.per_second,smsp__sass_branch_targets_threads_uniform.sum.per_second --target-processes all --kernel-id "::calculate_jamps:" --kernel-name-base function $exe1 $args | egrep '(calculate_jamps| sm)' | tr "\n" " " | awk '{printf "%29s: %-51s %-10s %s\n", "", $18, $19, $22$21; printf "%29s: %-51s %-10s %s\n", "", $28, $29, $32$31; printf "%29s: %-51s %-10s %s\n", "", $23, $24, $27$26; printf "%29s: %-51s %s\n", "", $33, $35}' + out=$($(which ncu) --metrics sm__sass_average_branch_targets_threads_uniform.pct,smsp__warps_launched.sum,smsp__sass_branch_targets.sum,smsp__sass_branch_targets_threads_divergent.sum,smsp__sass_branch_targets_threads_uniform.sum,smsp__sass_branch_targets.sum.per_second,smsp__sass_branch_targets_threads_divergent.sum.per_second,smsp__sass_branch_targets_threads_uniform.sum.per_second --target-processes all --kernel-id "::calculate_jamps:" --kernel-name-base function $exe1 $args | egrep '(calculate_jamps| sm)' | tr "\n" " ") ###echo $out echo $out | awk -v key1="smsp__sass_branch_targets.sum" '{key2=key1".per_second"; val1="N/A"; val2=""; for (i=1; i<=NF; i++){if ($i==key1 && $(i+1)!="(!)") val1=$(i+1); if ($i==key2 && $(i+1)!="(!)") val2=$(i+2)$(i+1)}; printf "%29s: %-51s %-10s %s\n", "", key1, val1, val2}' echo $out | awk -v key1="smsp__sass_branch_targets_threads_uniform.sum" '{key2=key1".per_second"; val1="N/A"; val2=""; for (i=1; i<=NF; i++){if ($i==key1 && $(i+1)!="(!)") val1=$(i+1); if ($i==key2 && $(i+1)!="(!)") val2=$(i+2)$(i+1)}; printf "%29s: %-51s %-10s %s\n", "", key1, val1, val2}' @@ -637,7 +684,7 @@ function runNcuReq() { for args in "-p 1 1 1" "-p 1 4 1" "-p 1 8 1" "-p 1 32 1" "$ncuArgs"; do ###echo "runNcuReq $exe1 $args" # NB This will print nothing if $args are invalid (eg "-p 1 4 1" when neppR=8) - $(which ncu) --metrics l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum,launch__registers_per_thread,sm__sass_average_branch_targets_threads_uniform.pct --target-processes all --kernel-id "::sigmaKin:" --kernel-name-base function $exe1 $args | egrep '(sigmaKin|registers| sm|l1tex)' | tr "\n" " " | awk -vtag="[$args]" '{print $1, $2, $3, $16"s", $17";", $19"s", $20, tag}' + $(which ncu) --metrics l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum,launch__registers_per_thread,sm__sass_average_branch_targets_threads_uniform.pct --target-processes all --kernel-id "::calculate_jamps:" --kernel-name-base function $exe1 $args | egrep '(calculate_jamps|registers| sm|l1tex)' | tr "\n" " " | awk -vtag="[$args]" '{print $1, $2, $3, $16"s", $17";", $19"s", $20, tag}' done set +x } @@ -659,10 +706,19 @@ else fi echo -e "On $HOSTNAME [CPU: $cpuTxt] [GPU: $gpuTxt]:" +# Configure scaling tests +if [ "${scaling}" == "0" ]; then # no scaling tests (throughput tests only) + exesSc= +elif [ "${scaling}" == "1" ]; then # scaling tests only (skip throughput tests) + exesSc=$exes + exes= +fi + # These two settings are needed by BMK containers: do not change them BMKEXEARGS="" # if BMKEXEARGS is set, exeArgs is set equal to BMKEXEARGS, while exeArgs2 is set to "" BMKMULTIPLIER=1 # the pre-defined numbers of iterations (including 
those in BMKEXEARGS) are multiplied by BMKMULTIPLIER +# (1) TRADITIONAL THROUGHPUT TESTS ###lastExe= lastExeDir= ###echo "exes=$exes" @@ -726,7 +782,7 @@ for exe in $exes; do exeArgs="-p 64 256 1" ncuArgs="-p 64 256 1" # For ggttgg (NEW): on GPU test both "64 256" and "2048 256" for ggttgg as the latter gives ~10% higher throughput on cuda110/gcc92 - exeArgs2="-p 2048 256 1" + ###exeArgs2="-p 2048 256 1" # Sep 2025: this aborts (and is not needed as the plateau is reached earlier) with helicity streams elif [ "${exe%%/gg_ttg*}" != "${exe}" ]; then # For ggttg, as on ggttgg: this is a good GPU middle point: tput is 1.5x lower with "32 256 1", only a few% higher with "128 256 1" ###exeArgs="-p 64 256 1" # too short! see https://its.cern.ch/jira/browse/BMK-1056 @@ -760,9 +816,16 @@ for exe in $exes; do unset OMP_NUM_THREADS fi elif [[ "${exe%%/check_cuda*}" != "${exe}" || "${exe%%/check_hip*}" != "${exe}" ]] || [ "${exe%%/alpcheck*}" != "${exe}" ]; then + echo "........................................................................." runNcu $exe "$ncuArgs" - if [ "${div}" == "1" ]; then runNcuDiv $exe; fi - if [ "${req}" == "1" ]; then runNcuReq $exe "$ncuArgs"; fi + if [ "${div}" == "1" ]; then + echo "........................................................................." + runNcuDiv $exe + fi + if [ "${req}" == "1" ]; then + echo "........................................................................." + runNcuReq $exe "$ncuArgs" + fi if [ "${exeArgs2}" != "" ]; then echo "........................................................................."; runExe $exe "$exeArgs2"; fi fi if [ "${gtest}" == "1" ]; then @@ -777,6 +840,51 @@ for exe in $exes; do cmpExe $exe fi done +###echo "=========================================================================" + +# (2) SCALING TESTS +lastExeDir= +for exe in $exesSc; do + if [ "$(basename $(dirname $exe))" != "$lastExeDir" ]; then + echo "=========================================================================" + lastExeDir=$(basename $(dirname $exe)) + else + echo "-------------------------------------------------------------------------" + fi + echo "scalingTest $exe" + if [ ! -f $exe ]; then echo "Not found: $exe"; continue; fi + if [ "${unamep}" != "x86_64" ]; then + if [ "${exe/build.avx2}" != "${exe}" ]; then echo "$exe is not supported on ${unamep}"; continue; fi + if [ "${exe/build.512y}" != "${exe}" ]; then echo "$exe is not supported on ${unamep}"; continue; fi + if [ "${exe/build.512z}" != "${exe}" ]; then echo "$exe is not supported on ${unamep}"; continue; fi + elif [ "${unames}" == "Darwin" ]; then + if [ "${exe/build.512y}" != "${exe}" ]; then echo "$exe is not supported on ${unames}"; continue; fi + if [ "${exe/build.512z}" != "${exe}" ]; then echo "$exe is not supported on ${unames}"; continue; fi + elif [ "$(grep -m1 -c avx512vl /proc/cpuinfo)" != "1" ]; then + if [ "${exe/build.512y}" != "${exe}" ]; then echo "$exe is not supported (no avx512vl in /proc/cpuinfo)"; continue; fi + if [ "${exe/build.512z}" != "${exe}" ]; then echo "$exe is not supported (no avx512vl in /proc/cpuinfo)"; continue; fi + fi + exeDir=$(dirname $exe) + cd $exeDir/.. 
# workaround for reading '../../Cards/param_card.dat' without setting MG5AMC_CARD_PATH + unset OMP_NUM_THREADS + # Scaling test with 256 threads per block + if [[ "${exe%%/check_cuda*}" != "${exe}" || "${exe%%/check_hip*}" != "${exe}" ]]; then + echo "### GPU: scaling test 256" + for b in 1 2 4 8 16 32 64 128 256 512 1024; do ( $exe -p $b 256 1 | \grep "EvtsPerSec\[MECalcOnly\]" | awk -vb=$b "{printf \"%s %4d %3d\n\", \$5, b, 256}" ) |& sed "s/Gpu.*Assert/Assert/" |& sed 's/.*rocdevice.cpp.*Aborting.*/rocdevice.cpp: Aborting/'; done + if [[ "${exe%%/check_hip*}" != "${exe}" ]]; then + echo "### GPU: scaling test 64" + for b in 1 2 4 8 16 32 64 128 256 512 1024 2048 4096; do ( $exe -p $b 64 1 | \grep "EvtsPerSec\[MECalcOnly\]" | awk -vb=$b "{printf \"%s %4d %3d\n\", \$5, b, 64}" ) |& sed 's/.*rocdevice.cpp.*Aborting.*/rocdevice.cpp: Aborting/'; done # HIP (AMD GPU warp size is 32) + else + echo "### GPU: scaling test 32" + for b in 1 2 4 8 16 32 64 128 256 512 1024 2048 4096 8192; do ( $exe -p $b 32 1 | \grep "EvtsPerSec\[MECalcOnly\]" | awk -vb=$b "{printf \"%s %4d %3d\n\", \$5, b, 32}" ) |& sed "s/Gpu.*Assert/Assert/"; done # CUDA (NVidia GPU warp size is 32) + fi + else + echo "### CPU: scaling test 256" + for b in 1 2 4; do ( $exe -p $b 256 1 | \grep "EvtsPerSec\[MECalcOnly\]" | awk -vb=$b "{printf \"%s %4d %3d\n\", \$5, b, 256}" ); done + echo "### CPU: scaling test 32" + for b in 1 2 4; do ( $exe -p $b 32 1 | \grep "EvtsPerSec\[MECalcOnly\]" | awk -vb=$b "{printf \"%s %4d %3d\n\", \$5, b, 32}" ); done + fi +done echo "=========================================================================" # Workaround for reading of data files
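Note: a minimal sketch (not part of the patch above) of reproducing the -blasOn runtime configuration by hand, using only the environment variables that throughputX.sh exports; the relative build directory and the "-p 2048 256 2" arguments are copied from the runExe lines in the logs and are assumptions about the local SubProcesses/P1_gg_ttx layout.
  export HASBLAS=hasBlas                   # build-time setting exported before make; -noBlas switches this to hasNoBlas
  export CUDACPP_RUNTIME_BLASCOLORSUM=1    # runtime setting exported by -blasOn; unset it to reproduce the default runs
  unset CUDACPP_RUNTIME_CUBLASTF32TENSOR   # left unset by throughputX.sh in these logs (TF32 tensor mode off)
  ./build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2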

[Perl diagram-page generator, identical hunk in each generated copy: the per-page diagram images, the warning comment, and the file-existence check switch from JPEG (.jpg) to PNG; the HTML markup inside the print statements did not survive, so the -/+ print lines below appear identical.]

 \ Postscript Diagrams for $proc\<\/A\> \ \n";
 	for($j=1;$j<$pages;$j++){
-	    print PAGE "\\"Page \ \n";
+	    print PAGE "\\"Page \ \n";
 	}#end of for
 #
-# In case I didn't include all of the diagrams as jpeg, warn user
+# In case I didn't include all of the diagrams as PNG, warn user
 #
-	if (-e "matrix$imatrix$max_jpg.jpg" ) {
-	    print PAGE "
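
The file-existence check at the end of the hunk is what triggers the "not all diagrams included" warning. A rough shell equivalent of that logic, with hypothetical values standing in for the Perl variables $imatrix and $max_jpg, might look like this:

#!/bin/bash
# Rough sketch of the generator's warning logic (imatrix and max_jpg are
# hypothetical placeholders; the real values come from the Perl script).
imatrix=1
max_jpg=20
if [ -e "matrix${imatrix}${max_jpg}.png" ]; then
  echo "Warning: not all diagrams were included as PNG pages"
fi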